From a539495d67385ffd79994955efd423fa20a2e50b Mon Sep 17 00:00:00 2001 From: Brodey Newman Date: Thu, 5 Dec 2024 23:42:33 -0500 Subject: [PATCH] Working cublas batched example (#60) --- codegen/annotations.h | 502 +++--- codegen/codegen.py | 122 +- codegen/gen_api.h | 272 ++-- codegen/gen_client.cpp | 1358 ++++++++++++++-- codegen/gen_server.cpp | 3162 +++++++++++++++++++++++++++++++------ codegen/manual_server.cpp | 2 +- local.sh | 26 +- test/cublas_batched.cu | 196 +++ test/cublas_utils.h | 351 ++++ 9 files changed, 4864 insertions(+), 1127 deletions(-) create mode 100644 test/cublas_batched.cu create mode 100644 test/cublas_utils.h diff --git a/codegen/annotations.h b/codegen/annotations.h index f39ae0b..ddaf798 100644 --- a/codegen/annotations.h +++ b/codegen/annotations.h @@ -12141,6 +12141,7 @@ cublasStatus_t cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl */ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12148,17 +12149,17 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, 
const __half* const Aarray[], int lda, const __half* const Barray[], int ldb, const __half* beta, __half* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12166,17 +12167,17 @@ cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t trans * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* const Aarray[], int64_t lda, const __half* const Barray[], int64_t ldb, const __half* beta, __half* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12184,17 +12185,17 @@ cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* const Aarray[], int lda, const float* const Barray[], int ldb, const float* beta, 
float* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY @@ -12202,158 +12203,157 @@ cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t trans * @param n SEND_ONLY * @param k SEND_ONLY * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* const Aarray[], int64_t lda, const float* const Barray[], int64_t ldb, const float* beta, float* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle 
SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* const Aarray[], int64_t lda, const double* const Barray[], int64_t ldb, const double* beta, double* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n 
SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray 
SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int lda, const cuDoubleComplex* const Barray[], int ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int ldc, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * 
@param Aarray SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int64_t lda, const cuDoubleComplex* const Barray[], int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int64_t ldc, int64_t batchCount); /** @@ -12363,14 +12363,14 @@ cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12384,14 +12384,14 @@ cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_ * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12405,14 +12405,14 @@ cublasStatus_t cublasHgemmStridedBatched_64(cublasHandle_t handle, cublasOperati * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A 
SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12426,14 +12426,14 @@ cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_ * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12447,14 +12447,14 @@ cublasStatus_t cublasSgemmStridedBatched_64(cublasHandle_t handle, cublasOperati * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12468,14 +12468,14 @@ cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_ * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12489,14 +12489,14 @@ cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperati * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha 
SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12510,14 +12510,14 @@ cublasStatus_t cublasCgemmStridedBatched(cublasHandle_t handle, cublasOperation_ * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12531,14 +12531,14 @@ cublasStatus_t cublasCgemmStridedBatched_64(cublasHandle_t handle, cublasOperati * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12552,14 +12552,14 @@ cublasStatus_t cublasCgemm3mStridedBatched(cublasHandle_t handle, cublasOperatio * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12573,14 +12573,14 @@ cublasStatus_t cublasCgemm3mStridedBatched_64(cublasHandle_t handle, cublasOpera * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param 
alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12594,14 +12594,14 @@ cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_ * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY * @param strideA SEND_ONLY * @param B SEND_RECV * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param ldc SEND_ONLY * @param strideC SEND_ONLY @@ -12609,47 +12609,48 @@ cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_ */ cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* B, int64_t ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc, long long int strideC, int64_t batchCount); /** + * @disabled + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param Aarray SEND_ONLY LENGTH:batchCount * @param Atype SEND_ONLY * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param Btype SEND_ONLY * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param 
Ctype SEND_ONLY * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY * @param computeType SEND_ONLY * @param algo SEND_ONLY */ cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int lda, const void* const Barray[], cudaDataType Btype, int ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int ldc, int batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param transa SEND_ONLY * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV - * @param Aarray SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param Aarray SEND_ONLY LENGTH:batchCount * @param Atype SEND_ONLY * @param lda SEND_ONLY - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchCount * @param Btype SEND_ONLY * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY + * @param beta SEND_RECV NULLABLE + * @param Carray SEND_ONLY LENGTH:batchCount * @param Ctype SEND_ONLY * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY * @param computeType SEND_ONLY * @param algo SEND_ONLY */ @@ -12661,7 +12662,7 @@ cublasStatus_t cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t t * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param Atype SEND_ONLY * @param lda SEND_ONLY @@ -12670,7 +12671,7 @@ cublasStatus_t cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t t * @param Btype SEND_ONLY * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param Ctype SEND_ONLY * @param ldc SEND_ONLY @@ -12687,7 +12688,7 @@ cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle, 
cublasOperation * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param Atype SEND_ONLY * @param lda SEND_ONLY @@ -12696,7 +12697,7 @@ cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle, cublasOperation * @param Btype SEND_ONLY * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param Ctype SEND_ONLY * @param ldc SEND_ONLY @@ -12712,10 +12713,10 @@ cublasStatus_t cublasGemmStridedBatchedEx_64(cublasHandle_t handle, cublasOperat * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param B SEND_RECV * @param ldb SEND_ONLY * @param C SEND_RECV @@ -12728,10 +12729,10 @@ cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cubl * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param B SEND_RECV * @param ldb SEND_ONLY * @param C SEND_RECV @@ -12744,10 +12745,10 @@ cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, c * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param B SEND_RECV * @param ldb SEND_ONLY * @param C SEND_RECV @@ -12760,10 +12761,10 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cubl * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A 
SEND_RECV * @param lda SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param B SEND_RECV * @param ldb SEND_ONLY * @param C SEND_RECV @@ -12776,10 +12777,10 @@ cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, c * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param B SEND_RECV * @param ldb SEND_ONLY * @param C SEND_RECV @@ -12792,10 +12793,10 @@ cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, cubl * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param B SEND_RECV * @param ldb SEND_ONLY * @param C SEND_RECV @@ -12808,10 +12809,10 @@ cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, c * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param B SEND_RECV * @param ldb SEND_ONLY * @param C SEND_RECV @@ -12824,10 +12825,10 @@ cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cubl * @param transb SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param B SEND_RECV * @param ldb SEND_ONLY * @param C SEND_RECV @@ -12835,6 +12836,7 @@ cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cubl */ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t 
transb, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* beta, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12842,15 +12844,15 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, c * @param diag SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* const A[], int lda, float* const B[], int ldb, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12858,15 +12860,15 @@ cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, * @param diag SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* const A[], int64_t lda, float* const B[], int64_t ldb, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12874,15 +12876,15 @@ 
cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid * @param diag SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* const A[], int lda, double* const B[], int ldb, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12890,15 +12892,15 @@ cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, * @param diag SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* const A[], int64_t lda, double* const B[], int64_t ldb, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12906,15 +12908,15 @@ cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid * @param diag SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B 
SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* const A[], int lda, cuComplex* const B[], int ldb, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12922,15 +12924,15 @@ cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, * @param diag SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* const A[], int64_t lda, cuComplex* const B[], int64_t ldb, int64_t batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12938,15 +12940,15 @@ cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid * @param diag SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int 
lda, cuDoubleComplex* const B[], int ldb, int batchCount); /** + * @param batchCount SEND_ONLY * @param handle SEND_ONLY * @param side SEND_ONLY * @param uplo SEND_ONLY @@ -12954,12 +12956,11 @@ cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, * @param diag SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param alpha SEND_RECV - * @param A SEND_ONLY + * @param alpha SEND_RECV NULLABLE + * @param A SEND_ONLY LENGTH:batchCount * @param lda SEND_ONLY - * @param B SEND_ONLY + * @param B SEND_ONLY LENGTH:batchCount * @param ldb SEND_ONLY - * @param batchCount SEND_ONLY */ cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int64_t lda, cuDoubleComplex* const B[], int64_t ldb, int64_t batchCount); /** @@ -12969,7 +12970,7 @@ cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid * @param n SEND_ONLY * @param A SEND_RECV * @param lda SEND_ONLY - * @param x SEND_RECV + * @param x SEND_RECV NULLABLE * @param incx SEND_ONLY * @param C SEND_RECV * @param ldc SEND_ONLY @@ -12982,7 +12983,7 @@ cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, * @param n SEND_ONLY * @param A SEND_RECV * @param lda SEND_ONLY - * @param x SEND_RECV + * @param x SEND_RECV NULLABLE * @param incx SEND_ONLY * @param C SEND_RECV * @param ldc SEND_ONLY @@ -12995,7 +12996,7 @@ cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 * @param n SEND_ONLY * @param A SEND_RECV * @param lda SEND_ONLY - * @param x SEND_RECV + * @param x SEND_RECV NULLABLE * @param incx SEND_ONLY * @param C SEND_RECV * @param ldc SEND_ONLY @@ -13008,7 +13009,7 @@ cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, * @param n SEND_ONLY * @param A SEND_RECV * @param lda SEND_ONLY - * @param x 
SEND_RECV + * @param x SEND_RECV NULLABLE * @param incx SEND_ONLY * @param C SEND_RECV * @param ldc SEND_ONLY @@ -13021,7 +13022,7 @@ cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 * @param n SEND_ONLY * @param A SEND_RECV * @param lda SEND_ONLY - * @param x SEND_RECV + * @param x SEND_RECV NULLABLE * @param incx SEND_ONLY * @param C SEND_RECV * @param ldc SEND_ONLY @@ -13034,7 +13035,7 @@ cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, * @param n SEND_ONLY * @param A SEND_RECV * @param lda SEND_ONLY - * @param x SEND_RECV + * @param x SEND_RECV NULLABLE * @param incx SEND_ONLY * @param C SEND_RECV * @param ldc SEND_ONLY @@ -13047,7 +13048,7 @@ cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 * @param n SEND_ONLY * @param A SEND_RECV * @param lda SEND_ONLY - * @param x SEND_RECV + * @param x SEND_RECV NULLABLE * @param incx SEND_ONLY * @param C SEND_RECV * @param ldc SEND_ONLY @@ -13060,165 +13061,165 @@ cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, * @param n SEND_ONLY * @param A SEND_RECV * @param lda SEND_ONLY - * @param x SEND_RECV + * @param x SEND_RECV NULLABLE * @param incx SEND_ONLY * @param C SEND_RECV * @param ldc SEND_ONLY */ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* C, int64_t ldc); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Ainv SEND_ONLY + * @param Ainv SEND_ONLY LENGTH:batchSize * @param lda_inv SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* const A[], int lda, float* const Ainv[], int lda_inv, int* info, int batchSize); /** + * @param 
batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Ainv SEND_ONLY + * @param Ainv SEND_ONLY LENGTH:batchSize * @param lda_inv SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* const A[], int lda, double* const Ainv[], int lda_inv, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Ainv SEND_ONLY + * @param Ainv SEND_ONLY LENGTH:batchSize * @param lda_inv SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, cuComplex* const Ainv[], int lda_inv, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Ainv SEND_ONLY + * @param Ainv SEND_ONLY LENGTH:batchSize * @param lda_inv SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const Ainv[], int lda_inv, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param TauArray SEND_ONLY + * @param TauArray SEND_ONLY LENGTH:batchSize * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* const Aarray[], int lda, float* const TauArray[], int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param 
handle SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param TauArray SEND_ONLY + * @param TauArray SEND_ONLY LENGTH:batchSize * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* const Aarray[], int lda, double* const TauArray[], int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param TauArray SEND_ONLY + * @param TauArray SEND_ONLY LENGTH:batchSize * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComplex* const Aarray[], int lda, cuComplex* const TauArray[], int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param TauArray SEND_ONLY + * @param TauArray SEND_ONLY LENGTH:batchSize * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const TauArray[], int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV * @param devInfoArray SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, 
float* const Aarray[], int lda, float* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV * @param devInfoArray SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, double* const Aarray[], int lda, double* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV * @param devInfoArray SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuComplex* const Aarray[], int lda, cuComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param m SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param Carray SEND_ONLY + * @param Carray SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV * @param devInfoArray SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuDoubleComplex* const Aarray[], int 
lda, cuDoubleComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize); /** * @param handle SEND_ONLY * @param uplo SEND_ONLY * @param n SEND_ONLY - * @param AP SEND_RECV + * @param AP SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY */ @@ -13227,7 +13228,7 @@ cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, * @param handle SEND_ONLY * @param uplo SEND_ONLY * @param n SEND_ONLY - * @param AP SEND_RECV + * @param AP SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY */ @@ -13236,7 +13237,7 @@ cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, * @param handle SEND_ONLY * @param uplo SEND_ONLY * @param n SEND_ONLY - * @param AP SEND_RECV + * @param AP SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY */ @@ -13245,7 +13246,7 @@ cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, * @param handle SEND_ONLY * @param uplo SEND_ONLY * @param n SEND_ONLY - * @param AP SEND_RECV + * @param AP SEND_RECV NULLABLE * @param A SEND_RECV * @param lda SEND_ONLY */ @@ -13254,7 +13255,7 @@ cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, * @param handle SEND_ONLY * @param uplo SEND_ONLY * @param n SEND_ONLY - * @param A SEND_RECV + * @param A SEND_RECV NULLABLE * @param lda SEND_ONLY * @param AP SEND_RECV */ @@ -13263,7 +13264,7 @@ cublasStatus_t cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, * @param handle SEND_ONLY * @param uplo SEND_ONLY * @param n SEND_ONLY - * @param A SEND_RECV + * @param A SEND_RECV NULLABLE * @param lda SEND_ONLY * @param AP SEND_RECV */ @@ -13272,7 +13273,7 @@ cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, * @param handle SEND_ONLY * @param uplo SEND_ONLY * @param n SEND_ONLY - * @param A SEND_RECV + * @param A SEND_RECV NULLABLE * @param lda SEND_ONLY * @param AP SEND_RECV */ @@ -13281,7 +13282,7 @@ cublasStatus_t 
cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, * @param handle SEND_ONLY * @param uplo SEND_ONLY * @param n SEND_ONLY - * @param A SEND_RECV + * @param A SEND_RECV NULLABLE * @param lda SEND_ONLY * @param AP SEND_RECV */ @@ -13327,107 +13328,107 @@ cublasStatus_t cublasCgetrfBatched(cublasHandle_t handle, int n, cuComplex* cons */ cublasStatus_t cublasZgetrfBatched(cublasHandle_t handle, int n, cuDoubleComplex* const A[], int lda, int* P, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param P SEND_RECV - * @param C SEND_ONLY + * @param P SEND_RECV NULLABLE + * @param C SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* const A[], int lda, const int* P, float* const C[], int ldc, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param P SEND_RECV - * @param C SEND_ONLY + * @param P SEND_RECV NULLABLE + * @param C SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* const A[], int lda, const int* P, double* const C[], int ldc, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param P SEND_RECV - * @param C SEND_ONLY + * @param P SEND_RECV NULLABLE + * @param C SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const 
cuComplex* const A[], int lda, const int* P, cuComplex* const C[], int ldc, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param n SEND_ONLY - * @param A SEND_ONLY + * @param A SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param P SEND_RECV - * @param C SEND_ONLY + * @param P SEND_RECV NULLABLE + * @param C SEND_ONLY LENGTH:batchSize * @param ldc SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, const int* P, cuDoubleComplex* const C[], int ldc, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param devIpiv SEND_RECV - * @param Barray SEND_ONLY + * @param devIpiv SEND_RECV NULLABLE + * @param Barray SEND_ONLY LENGTH:batchSize * @param ldb SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float* const Aarray[], int lda, const int* devIpiv, float* const Barray[], int ldb, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param devIpiv SEND_RECV - * @param Barray SEND_ONLY + * @param devIpiv SEND_RECV NULLABLE + * @param Barray SEND_ONLY LENGTH:batchSize * @param ldb SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const double* const Aarray[], int lda, const int* devIpiv, double* const Barray[], int ldb, int* info, int batchSize); /** + * 
@param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY * @param devIpiv SEND_RECV - * @param Barray SEND_ONLY + * @param Barray SEND_ONLY LENGTH:batchSize * @param ldb SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuComplex* const Aarray[], int lda, const int* devIpiv, cuComplex* const Barray[], int ldb, int* info, int batchSize); /** + * @param batchSize SEND_ONLY * @param handle SEND_ONLY * @param trans SEND_ONLY * @param n SEND_ONLY * @param nrhs SEND_ONLY - * @param Aarray SEND_ONLY + * @param Aarray SEND_ONLY LENGTH:batchSize * @param lda SEND_ONLY - * @param devIpiv SEND_RECV - * @param Barray SEND_ONLY + * @param devIpiv SEND_RECV NULLABLE + * @param Barray SEND_ONLY LENGTH:batchSize * @param ldb SEND_ONLY * @param info SEND_RECV - * @param batchSize SEND_ONLY */ cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuDoubleComplex* const Aarray[], int lda, const int* devIpiv, cuDoubleComplex* const Barray[], int ldb, int* info, int batchSize); /** @@ -13465,14 +13466,14 @@ cublasStatus_t cublasMigrateComputeType(cublasHandle_t handle, cudaDataType_t da * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param Atype SEND_ONLY * @param lda SEND_ONLY * @param B SEND_RECV * @param Btype SEND_ONLY * @param ldb SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param Ctype SEND_ONLY * @param ldc SEND_ONLY @@ -13487,30 +13488,7 @@ cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cub * @param m SEND_ONLY * @param n SEND_ONLY * @param k SEND_ONLY - * @param 
alpha SEND_RECV - * @param Aarray SEND_ONLY - * @param Atype SEND_ONLY - * @param lda SEND_ONLY - * @param Barray SEND_ONLY - * @param Btype SEND_ONLY - * @param ldb SEND_ONLY - * @param beta SEND_RECV - * @param Carray SEND_ONLY - * @param Ctype SEND_ONLY - * @param ldc SEND_ONLY - * @param batchCount SEND_ONLY - * @param computeType SEND_ONLY - * @param algo SEND_ONLY - */ -cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int lda, const void* const Barray[], cudaDataType Btype, int ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int ldc, int batchCount, cudaDataType computeType, cublasGemmAlgo_t algo); -/** - * @param handle SEND_ONLY - * @param transa SEND_ONLY - * @param transb SEND_ONLY - * @param m SEND_ONLY - * @param n SEND_ONLY - * @param k SEND_ONLY - * @param alpha SEND_RECV + * @param alpha SEND_RECV NULLABLE * @param A SEND_RECV * @param Atype SEND_ONLY * @param lda SEND_ONLY @@ -13519,7 +13497,7 @@ cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, cublasOperation_t tran * @param Btype SEND_ONLY * @param ldb SEND_ONLY * @param strideB SEND_ONLY - * @param beta SEND_RECV + * @param beta SEND_RECV NULLABLE * @param C SEND_RECV * @param Ctype SEND_ONLY * @param ldc SEND_ONLY diff --git a/codegen/codegen.py b/codegen/codegen.py index c91ac7a..65bc104 100644 --- a/codegen/codegen.py +++ b/codegen/codegen.py @@ -190,10 +190,9 @@ def client_rpc_write(self, f): # array length operations are handled differently than char elif isinstance(self.ptr, Array): f.write( - " rpc_write(0, {param_name}, sizeof({param_type}[{length}])) < 0 ||\n".format( + " rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format( param_name=self.parameter.name, - param_type=self.ptr.format().replace("[]", ""), - length=self.length.name, + param_type=self.parameter.name, ) ) else: @@ -216,7 +215,7 @@ def 
server_declaration(self) -> str: c = self.ptr.const self.ptr.const = False # const[] isn't a valid part of a variable declaration - s = f" {self.ptr.format().replace("const[]", "")}* {self.parameter.name} = new {self.ptr.format().replace("const[]", "")}[{self.length.name}];\n" + s = f" {self.ptr.format().replace("const[]", "")}* {self.parameter.name} = nullptr;\n" self.ptr.const = c else: c = self.ptr.ptr_to.const @@ -230,17 +229,16 @@ def server_rpc_read(self, f): return elif isinstance(self.length, int): f.write( - " rpc_read(conn, {param_name}, {size}) < 0 ||\n".format( + " rpc_read(conn, &{param_name}, {size}) < 0 ||\n".format( param_name=self.parameter.name, size=self.length, ) ) elif isinstance(self.ptr, Array): f.write( - " rpc_read(conn, {param_name}, sizeof({param_type}[{length}])) < 0 ||\n".format( + " rpc_read(conn, &{param_name}, sizeof({param_type})) < 0 ||\n".format( param_name=self.parameter.name, param_type=self.ptr.format().replace("[]", ""), - length=self.length.name, ) ) else: @@ -256,12 +254,6 @@ def server_rpc_read(self, f): ) ) - def server_len_rpc_read(self, f): - f.write(" if (rpc_read(conn, &{length_param}, sizeof(int)) < 0)\n".format( - length_param=self.length.name, - )) - f.write(" return -1;\n") - @property def server_reference(self) -> str: return self.parameter.name @@ -403,12 +395,13 @@ class OpaqueTypeOperation: def client_rpc_write(self, f): if not self.send: return - f.write( - " rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format( - param_name=self.parameter.name, - param_type=self.type_.format(), + else: + f.write( + " rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format( + param_name=self.parameter.name, + param_type=self.type_.format(), + ) ) - ) @property def server_declaration(self) -> str: @@ -418,7 +411,8 @@ def server_declaration(self) -> str: # but "const cudnnTensorDescriptor_t *xDesc" IS valid. This subtle change carries reprecussions. 
elif "const " in self.type_.format() and not "void" in self.type_.format() and not "*" in self.type_.format(): return f" {self.type_.format().replace("const", "")} {self.parameter.name};\n" - else: return f" {self.type_.format()} {self.parameter.name};\n" + else: + return f" {self.type_.format()} {self.parameter.name};\n" def server_rpc_read(self, f): if not self.send: @@ -703,7 +697,15 @@ def main(): functions_with_annotations: list[tuple[Function, Function, list[Operation]]] = [] + dupes = {} + for function in functions: + # ensure duplicate functions can't be written + if dupes.get(function.name.format()): + continue + + dupes[function.name.format()] = True + try: annotation = next( f for f in annotations.namespace.functions if f.name == function.name @@ -915,14 +917,6 @@ def main(): for function, annotation, operations, disabled in functions_with_annotations: if function.name.format() in MANUAL_IMPLEMENTATIONS or disabled: continue - batched = False - - # not a fan of this, but the batched functions are pretty standard with the flow below. - # batched functions are cublas functions that send pointer arrays where batchCount describes... - # the number of pointers in the arrays. This is non-trivial to generate. 
- if "Batched" in function.name.format(): - batched = True - # parse the annotation doxygen f.write( "int handle_{name}(void *conn)\n".format( @@ -933,70 +927,28 @@ def main(): defers = [] - if batched: - array_batches = [] - non_array_batches = [] - - for operation in operations: - if isinstance(operation, NullTerminatedOperation): - if error := operation.server_rpc_read(f, len(defers)): - defers.append(error) - if isinstance(operation, ArrayOperation): - array_batches.append(operation) - if not isinstance(operation, ArrayOperation): - non_array_batches.append(operation) - - # print our normal operations the same - for operation in operations: - if operation not in array_batches: - f.write(operation.server_declaration) - - # do something with array batches - if len(array_batches) > 0 and hasattr(array_batches[0], "server_len_rpc_read"): - array_batches[0].server_len_rpc_read(f) - - # pop here, because we already accounted for the batchCount integer - non_array_batches.pop(0) - - for op in array_batches: - f.write(op.server_declaration) - - f.write(" int request_id;\n") - if function.return_type.format() != "void": - f.write(" {return_type} scuda_intercept_result;\n".format(return_type=function.return_type.format())) - else: - f.write(" void* scuda_intercept_result;\n".format(return_type=function.return_type.format())) + for operation in operations: + f.write(operation.server_declaration) - f.write(" if (\n") - for operation in operations: - operation.server_rpc_read(f) - f.write(" false)\n") - f.write(" goto ERROR_{index};\n".format(index=len(defers))) + f.write(" int request_id;\n") - f.write("\n") + # we only generate return from non-void types + if function.return_type.format() != "void": + f.write(" {return_type} scuda_intercept_result;\n".format(return_type=function.return_type.format())) else: - for operation in operations: - f.write(operation.server_declaration) + f.write(" void* scuda_intercept_result;\n".format(return_type=function.return_type.format())) 
- f.write(" int request_id;\n") - - # we only generate return from non-void types - if function.return_type.format() != "void": - f.write(" {return_type} scuda_intercept_result;\n".format(return_type=function.return_type.format())) + f.write(" if (\n") + for operation in operations: + if isinstance(operation, NullTerminatedOperation): + if error := operation.server_rpc_read(f, len(defers)): + defers.append(error) else: - f.write(" void* scuda_intercept_result;\n".format(return_type=function.return_type.format())) - - f.write(" if (\n") - for operation in operations: - if isinstance(operation, NullTerminatedOperation): - if error := operation.server_rpc_read(f, len(defers)): - defers.append(error) - else: - operation.server_rpc_read(f) - f.write(" false)\n") - f.write(" goto ERROR_{index};\n".format(index=len(defers))) + operation.server_rpc_read(f) + f.write(" false)\n") + f.write(" goto ERROR_{index};\n".format(index=len(defers))) - f.write("\n") + f.write("\n") f.write( " request_id = rpc_end_request(conn);\n".format( diff --git a/codegen/gen_api.h b/codegen/gen_api.h index 8fb6410..2545cf2 100644 --- a/codegen/gen_api.h +++ b/codegen/gen_api.h @@ -1252,118 +1252,160 @@ #define RPC_cublasCtrmm_v2_64 1251 #define RPC_cublasZtrmm_v2 1252 #define RPC_cublasZtrmm_v2_64 1253 -#define RPC_cublasHgemmStridedBatched 1254 -#define RPC_cublasHgemmStridedBatched_64 1255 -#define RPC_cublasSgemmStridedBatched 1256 -#define RPC_cublasSgemmStridedBatched_64 1257 -#define RPC_cublasDgemmStridedBatched 1258 -#define RPC_cublasDgemmStridedBatched_64 1259 -#define RPC_cublasCgemmStridedBatched 1260 -#define RPC_cublasCgemmStridedBatched_64 1261 -#define RPC_cublasCgemm3mStridedBatched 1262 -#define RPC_cublasCgemm3mStridedBatched_64 1263 -#define RPC_cublasZgemmStridedBatched 1264 -#define RPC_cublasZgemmStridedBatched_64 1265 -#define RPC_cublasSgeam 1266 -#define RPC_cublasSgeam_64 1267 -#define RPC_cublasDgeam 1268 -#define RPC_cublasDgeam_64 1269 -#define RPC_cublasCgeam 1270 
-#define RPC_cublasCgeam_64 1271 -#define RPC_cublasZgeam 1272 -#define RPC_cublasZgeam_64 1273 -#define RPC_cublasSdgmm 1274 -#define RPC_cublasSdgmm_64 1275 -#define RPC_cublasDdgmm 1276 -#define RPC_cublasDdgmm_64 1277 -#define RPC_cublasCdgmm 1278 -#define RPC_cublasCdgmm_64 1279 -#define RPC_cublasZdgmm 1280 -#define RPC_cublasZdgmm_64 1281 -#define RPC_cublasStpttr 1282 -#define RPC_cublasDtpttr 1283 -#define RPC_cublasCtpttr 1284 -#define RPC_cublasZtpttr 1285 -#define RPC_cublasStrttp 1286 -#define RPC_cublasDtrttp 1287 -#define RPC_cublasCtrttp 1288 -#define RPC_cublasZtrttp 1289 -#define RPC_cublasUint8gemmBias 1290 -#define RPC_cublasMigrateComputeType 1291 -#define RPC_cudnnGetVersion 1292 -#define RPC_cudnnGetMaxDeviceVersion 1293 -#define RPC_cudnnGetCudartVersion 1294 -#define RPC_cudnnGetErrorString 1295 -#define RPC_cudnnGetLastErrorString 1296 -#define RPC_cudnnQueryRuntimeError 1297 -#define RPC_cudnnGetProperty 1298 -#define RPC_cudnnCreate 1299 -#define RPC_cudnnDestroy 1300 -#define RPC_cudnnSetStream 1301 -#define RPC_cudnnGetStream 1302 -#define RPC_cudnnGetCallback 1303 -#define RPC_cudnnGraphVersionCheck 1304 -#define RPC_cudnnBackendCreateDescriptor 1305 -#define RPC_cudnnBackendDestroyDescriptor 1306 -#define RPC_cudnnBackendInitialize 1307 -#define RPC_cudnnBackendFinalize 1308 -#define RPC_cudnnBackendSetAttribute 1309 -#define RPC_cudnnBackendExecute 1310 -#define RPC_cudnnBackendPopulateCudaGraph 1311 -#define RPC_cudnnBackendUpdateCudaGraph 1312 -#define RPC_cudnnCreateTensorDescriptor 1313 -#define RPC_cudnnSetTensor4dDescriptor 1314 -#define RPC_cudnnSetTensor4dDescriptorEx 1315 -#define RPC_cudnnGetTensor4dDescriptor 1316 -#define RPC_cudnnGetTensorSizeInBytes 1317 -#define RPC_cudnnDestroyTensorDescriptor 1318 -#define RPC_cudnnInitTransformDest 1319 -#define RPC_cudnnCreateTensorTransformDescriptor 1320 -#define RPC_cudnnDestroyTensorTransformDescriptor 1321 -#define RPC_cudnnCreateOpTensorDescriptor 1322 -#define 
RPC_cudnnSetOpTensorDescriptor 1323 -#define RPC_cudnnGetOpTensorDescriptor 1324 -#define RPC_cudnnDestroyOpTensorDescriptor 1325 -#define RPC_cudnnCreateReduceTensorDescriptor 1326 -#define RPC_cudnnSetReduceTensorDescriptor 1327 -#define RPC_cudnnGetReduceTensorDescriptor 1328 -#define RPC_cudnnDestroyReduceTensorDescriptor 1329 -#define RPC_cudnnGetReductionIndicesSize 1330 -#define RPC_cudnnGetReductionWorkspaceSize 1331 -#define RPC_cudnnCreateFilterDescriptor 1332 -#define RPC_cudnnSetFilter4dDescriptor 1333 -#define RPC_cudnnGetFilter4dDescriptor 1334 -#define RPC_cudnnGetFilterSizeInBytes 1335 -#define RPC_cudnnDestroyFilterDescriptor 1336 -#define RPC_cudnnCreatePoolingDescriptor 1337 -#define RPC_cudnnSetPooling2dDescriptor 1338 -#define RPC_cudnnGetPooling2dDescriptor 1339 -#define RPC_cudnnGetPooling2dForwardOutputDim 1340 -#define RPC_cudnnDestroyPoolingDescriptor 1341 -#define RPC_cudnnCreateActivationDescriptor 1342 -#define RPC_cudnnSetActivationDescriptor 1343 -#define RPC_cudnnGetActivationDescriptor 1344 -#define RPC_cudnnSetActivationDescriptorSwishBeta 1345 -#define RPC_cudnnGetActivationDescriptorSwishBeta 1346 -#define RPC_cudnnDestroyActivationDescriptor 1347 -#define RPC_cudnnActivationForward 1348 -#define RPC_cudnnCreateLRNDescriptor 1349 -#define RPC_cudnnSetLRNDescriptor 1350 -#define RPC_cudnnGetLRNDescriptor 1351 -#define RPC_cudnnDestroyLRNDescriptor 1352 -#define RPC_cudnnDeriveBNTensorDescriptor 1353 -#define RPC_cudnnDeriveNormTensorDescriptor 1354 -#define RPC_cudnnCreateSpatialTransformerDescriptor 1355 -#define RPC_cudnnDestroySpatialTransformerDescriptor 1356 -#define RPC_cudnnCreateDropoutDescriptor 1357 -#define RPC_cudnnDestroyDropoutDescriptor 1358 -#define RPC_cudnnDropoutGetStatesSize 1359 -#define RPC_cudnnDropoutGetReserveSpaceSize 1360 -#define RPC_cudnnGetDropoutDescriptor 1361 -#define RPC_cudnnOpsVersionCheck 1362 -#define RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1363 -#define 
RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1364 -#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1365 -#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1366 -#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1367 -#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1368 +#define RPC_cublasHgemmBatched 1254 +#define RPC_cublasHgemmBatched_64 1255 +#define RPC_cublasSgemmBatched 1256 +#define RPC_cublasSgemmBatched_64 1257 +#define RPC_cublasDgemmBatched 1258 +#define RPC_cublasDgemmBatched_64 1259 +#define RPC_cublasCgemmBatched 1260 +#define RPC_cublasCgemmBatched_64 1261 +#define RPC_cublasCgemm3mBatched 1262 +#define RPC_cublasCgemm3mBatched_64 1263 +#define RPC_cublasZgemmBatched 1264 +#define RPC_cublasZgemmBatched_64 1265 +#define RPC_cublasHgemmStridedBatched 1266 +#define RPC_cublasHgemmStridedBatched_64 1267 +#define RPC_cublasSgemmStridedBatched 1268 +#define RPC_cublasSgemmStridedBatched_64 1269 +#define RPC_cublasDgemmStridedBatched 1270 +#define RPC_cublasDgemmStridedBatched_64 1271 +#define RPC_cublasCgemmStridedBatched 1272 +#define RPC_cublasCgemmStridedBatched_64 1273 +#define RPC_cublasCgemm3mStridedBatched 1274 +#define RPC_cublasCgemm3mStridedBatched_64 1275 +#define RPC_cublasZgemmStridedBatched 1276 +#define RPC_cublasZgemmStridedBatched_64 1277 +#define RPC_cublasGemmBatchedEx 1278 +#define RPC_cublasGemmBatchedEx_64 1279 +#define RPC_cublasSgeam 1280 +#define RPC_cublasSgeam_64 1281 +#define RPC_cublasDgeam 1282 +#define RPC_cublasDgeam_64 1283 +#define RPC_cublasCgeam 1284 +#define RPC_cublasCgeam_64 1285 +#define RPC_cublasZgeam 1286 +#define RPC_cublasZgeam_64 1287 +#define RPC_cublasStrsmBatched 1288 +#define RPC_cublasStrsmBatched_64 1289 +#define RPC_cublasDtrsmBatched 1290 +#define RPC_cublasDtrsmBatched_64 1291 +#define RPC_cublasCtrsmBatched 1292 +#define RPC_cublasCtrsmBatched_64 1293 +#define RPC_cublasZtrsmBatched 1294 +#define RPC_cublasZtrsmBatched_64 1295 +#define 
RPC_cublasSdgmm 1296 +#define RPC_cublasSdgmm_64 1297 +#define RPC_cublasDdgmm 1298 +#define RPC_cublasDdgmm_64 1299 +#define RPC_cublasCdgmm 1300 +#define RPC_cublasCdgmm_64 1301 +#define RPC_cublasZdgmm 1302 +#define RPC_cublasZdgmm_64 1303 +#define RPC_cublasSmatinvBatched 1304 +#define RPC_cublasDmatinvBatched 1305 +#define RPC_cublasCmatinvBatched 1306 +#define RPC_cublasZmatinvBatched 1307 +#define RPC_cublasSgeqrfBatched 1308 +#define RPC_cublasDgeqrfBatched 1309 +#define RPC_cublasCgeqrfBatched 1310 +#define RPC_cublasZgeqrfBatched 1311 +#define RPC_cublasSgelsBatched 1312 +#define RPC_cublasDgelsBatched 1313 +#define RPC_cublasCgelsBatched 1314 +#define RPC_cublasZgelsBatched 1315 +#define RPC_cublasStpttr 1316 +#define RPC_cublasDtpttr 1317 +#define RPC_cublasCtpttr 1318 +#define RPC_cublasZtpttr 1319 +#define RPC_cublasStrttp 1320 +#define RPC_cublasDtrttp 1321 +#define RPC_cublasCtrttp 1322 +#define RPC_cublasZtrttp 1323 +#define RPC_cublasSgetriBatched 1324 +#define RPC_cublasDgetriBatched 1325 +#define RPC_cublasCgetriBatched 1326 +#define RPC_cublasZgetriBatched 1327 +#define RPC_cublasSgetrsBatched 1328 +#define RPC_cublasDgetrsBatched 1329 +#define RPC_cublasCgetrsBatched 1330 +#define RPC_cublasZgetrsBatched 1331 +#define RPC_cublasUint8gemmBias 1332 +#define RPC_cublasMigrateComputeType 1333 +#define RPC_cudnnGetVersion 1334 +#define RPC_cudnnGetMaxDeviceVersion 1335 +#define RPC_cudnnGetCudartVersion 1336 +#define RPC_cudnnGetErrorString 1337 +#define RPC_cudnnGetLastErrorString 1338 +#define RPC_cudnnQueryRuntimeError 1339 +#define RPC_cudnnGetProperty 1340 +#define RPC_cudnnCreate 1341 +#define RPC_cudnnDestroy 1342 +#define RPC_cudnnSetStream 1343 +#define RPC_cudnnGetStream 1344 +#define RPC_cudnnGetCallback 1345 +#define RPC_cudnnGraphVersionCheck 1346 +#define RPC_cudnnBackendCreateDescriptor 1347 +#define RPC_cudnnBackendDestroyDescriptor 1348 +#define RPC_cudnnBackendInitialize 1349 +#define RPC_cudnnBackendFinalize 1350 +#define 
RPC_cudnnBackendSetAttribute 1351 +#define RPC_cudnnBackendExecute 1352 +#define RPC_cudnnBackendPopulateCudaGraph 1353 +#define RPC_cudnnBackendUpdateCudaGraph 1354 +#define RPC_cudnnCreateTensorDescriptor 1355 +#define RPC_cudnnSetTensor4dDescriptor 1356 +#define RPC_cudnnSetTensor4dDescriptorEx 1357 +#define RPC_cudnnGetTensor4dDescriptor 1358 +#define RPC_cudnnGetTensorSizeInBytes 1359 +#define RPC_cudnnDestroyTensorDescriptor 1360 +#define RPC_cudnnInitTransformDest 1361 +#define RPC_cudnnCreateTensorTransformDescriptor 1362 +#define RPC_cudnnDestroyTensorTransformDescriptor 1363 +#define RPC_cudnnCreateOpTensorDescriptor 1364 +#define RPC_cudnnSetOpTensorDescriptor 1365 +#define RPC_cudnnGetOpTensorDescriptor 1366 +#define RPC_cudnnDestroyOpTensorDescriptor 1367 +#define RPC_cudnnCreateReduceTensorDescriptor 1368 +#define RPC_cudnnSetReduceTensorDescriptor 1369 +#define RPC_cudnnGetReduceTensorDescriptor 1370 +#define RPC_cudnnDestroyReduceTensorDescriptor 1371 +#define RPC_cudnnGetReductionIndicesSize 1372 +#define RPC_cudnnGetReductionWorkspaceSize 1373 +#define RPC_cudnnCreateFilterDescriptor 1374 +#define RPC_cudnnSetFilter4dDescriptor 1375 +#define RPC_cudnnGetFilter4dDescriptor 1376 +#define RPC_cudnnGetFilterSizeInBytes 1377 +#define RPC_cudnnDestroyFilterDescriptor 1378 +#define RPC_cudnnCreatePoolingDescriptor 1379 +#define RPC_cudnnSetPooling2dDescriptor 1380 +#define RPC_cudnnGetPooling2dDescriptor 1381 +#define RPC_cudnnGetPooling2dForwardOutputDim 1382 +#define RPC_cudnnDestroyPoolingDescriptor 1383 +#define RPC_cudnnCreateActivationDescriptor 1384 +#define RPC_cudnnSetActivationDescriptor 1385 +#define RPC_cudnnGetActivationDescriptor 1386 +#define RPC_cudnnSetActivationDescriptorSwishBeta 1387 +#define RPC_cudnnGetActivationDescriptorSwishBeta 1388 +#define RPC_cudnnDestroyActivationDescriptor 1389 +#define RPC_cudnnActivationForward 1390 +#define RPC_cudnnCreateLRNDescriptor 1391 +#define RPC_cudnnSetLRNDescriptor 1392 +#define 
RPC_cudnnGetLRNDescriptor 1393 +#define RPC_cudnnDestroyLRNDescriptor 1394 +#define RPC_cudnnDeriveBNTensorDescriptor 1395 +#define RPC_cudnnDeriveNormTensorDescriptor 1396 +#define RPC_cudnnCreateSpatialTransformerDescriptor 1397 +#define RPC_cudnnDestroySpatialTransformerDescriptor 1398 +#define RPC_cudnnCreateDropoutDescriptor 1399 +#define RPC_cudnnDestroyDropoutDescriptor 1400 +#define RPC_cudnnDropoutGetStatesSize 1401 +#define RPC_cudnnDropoutGetReserveSpaceSize 1402 +#define RPC_cudnnGetDropoutDescriptor 1403 +#define RPC_cudnnOpsVersionCheck 1404 +#define RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1405 +#define RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1406 +#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1407 +#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1408 +#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1409 +#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1410 diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp index b6324f9..b608b62 100644 --- a/codegen/gen_client.cpp +++ b/codegen/gen_client.cpp @@ -16173,12 +16173,12 @@ cublasStatus_t cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const float*)) < 0 || - rpc_write(0, Aarray, sizeof(const float* const[batchCount])) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, xarray, sizeof(const float* const[batchCount])) < 0 || + rpc_write(0, &xarray, sizeof(xarray)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, &beta, sizeof(const float*)) < 0 || - rpc_write(0, yarray, sizeof(float* const[batchCount])) < 0 || + rpc_write(0, &yarray, sizeof(yarray)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) @@ -16196,12 +16196,12 @@ cublasStatus_t 
cublasTSTgemvBatched(cublasHandle_t handle, cublasOperation_t tra rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const float*)) < 0 || - rpc_write(0, Aarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, xarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 || + rpc_write(0, &xarray, sizeof(xarray)) < 0 || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, &beta, sizeof(const float*)) < 0 || - rpc_write(0, yarray, sizeof(__nv_bfloat16* const[batchCount])) < 0 || + rpc_write(0, &yarray, sizeof(yarray)) < 0 || rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) @@ -18504,10 +18504,11 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c return return_value; } -cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, long long int strideA, const __half* B, int ldb, long long int strideB, const __half* beta, __half* C, int ldc, long long int strideC, int batchCount) +cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* const Aarray[], int lda, const __half* const Barray[], int ldb, const __half* beta, __half* const Carray[], int ldc, int batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasHgemmStridedBatched) < 0 || + if (rpc_start_request(0, RPC_cublasHgemmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18515,28 +18516,24 @@ cublasStatus_t 
cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const __half*)) < 0 || - rpc_write(0, &A, sizeof(const __half*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const __half*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const __half*)) < 0 || - rpc_write(0, C, sizeof(__half)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasHgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* A, int64_t lda, long long int strideA, const __half* B, int64_t ldb, long long int strideB, const __half* beta, __half* C, int64_t ldc, long long int strideC, int64_t batchCount) +cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* const Aarray[], int64_t lda, const __half* const Barray[], int64_t ldb, const __half* beta, __half* const Carray[], int64_t ldc, int64_t batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasHgemmStridedBatched_64) < 0 || + if (rpc_start_request(0, RPC_cublasHgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, 
&transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18544,28 +18541,24 @@ cublasStatus_t cublasHgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || rpc_write(0, &alpha, sizeof(const __half*)) < 0 || - rpc_write(0, &A, sizeof(const __half*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const __half*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const __half*)) < 0 || - rpc_write(0, C, sizeof(__half)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, long long int strideA, const float* B, int ldb, long long int strideB, const float* beta, float* C, int ldc, long long int strideC, int batchCount) +cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* const Aarray[], int lda, const float* const Barray[], int ldb, const float* beta, float* const Carray[], int ldc, int batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasSgemmStridedBatched) < 0 || + if (rpc_start_request(0, RPC_cublasSgemmBatched) < 0 || + rpc_write(0, &batchCount, 
sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18573,28 +18566,24 @@ cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const float*)) < 0 || - rpc_write(0, &A, sizeof(const float*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const float*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const float*)) < 0 || - rpc_write(0, C, sizeof(float)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasSgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, long long int strideA, const float* B, int64_t ldb, long long int strideB, const float* beta, float* C, int64_t ldc, long long int strideC, int64_t batchCount) +cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* const Aarray[], int64_t lda, const float* const Barray[], int64_t ldb, const float* beta, float* const Carray[], int64_t ldc, int64_t batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, 
RPC_cublasSgemmStridedBatched_64) < 0 || + if (rpc_start_request(0, RPC_cublasSgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18602,28 +18591,24 @@ cublasStatus_t cublasSgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || rpc_write(0, &alpha, sizeof(const float*)) < 0 || - rpc_write(0, &A, sizeof(const float*)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const float*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const float*)) < 0 || - rpc_write(0, C, sizeof(float)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, long long int strideA, const double* B, int ldb, long long int strideB, const double* beta, double* C, int ldc, long long int strideC, int batchCount) +cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* 
const Carray[], int ldc, int batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDgemmStridedBatched) < 0 || + if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18631,28 +18616,26 @@ cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const double*)) < 0 || - rpc_write(0, C, sizeof(double)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, long long int strideA, const double* B, int64_t ldb, long long int strideB, const double* beta, double* C, int64_t ldc, long long int strideC, int64_t batchCount) +cublasStatus_t 
cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* const Aarray[], int64_t lda, const double* const Barray[], int64_t ldb, const double* beta, double* const Carray[], int64_t ldc, int64_t batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDgemmStridedBatched_64) < 0 || + if (rpc_start_request(0, RPC_cublasDgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18660,28 +18643,26 @@ cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || rpc_write(0, &alpha, sizeof(const double*)) < 0 || - rpc_write(0, &A, sizeof(const double*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const double*)) < 0 || - rpc_write(0, C, sizeof(double)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCgemmStridedBatched(cublasHandle_t handle, 
cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* B, int ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int ldc, long long int strideC, int batchCount) +cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCgemmStridedBatched) < 0 || + if (rpc_start_request(0, RPC_cublasCgemmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18689,28 +18670,26 @@ cublasStatus_t cublasCgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || - rpc_write(0, C, sizeof(cuComplex)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, 
&batchCount, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* B, int64_t ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) +cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCgemmStridedBatched_64) < 0 || + if (rpc_start_request(0, RPC_cublasCgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18718,28 +18697,26 @@ cublasStatus_t cublasCgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &strideB, 
sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || - rpc_write(0, C, sizeof(cuComplex)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCgemm3mStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* B, int ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int ldc, long long int strideC, int batchCount) +cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched) < 0 || + if (rpc_start_request(0, RPC_cublasCgemm3mBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18747,28 +18724,26 @@ cublasStatus_t cublasCgemm3mStridedBatched(cublasHandle_t handle, cublasOperatio rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, 
&Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || - rpc_write(0, C, sizeof(cuComplex)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCgemm3mStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* B, int64_t ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) +cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched_64) < 0 || + if (rpc_start_request(0, RPC_cublasCgemm3mBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18776,28 +18751,26 @@ 
cublasStatus_t cublasCgemm3mStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || - rpc_write(0, C, sizeof(cuComplex)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, long long int strideA, const cuDoubleComplex* B, int ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc, long long int strideC, int batchCount) +cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int lda, const cuDoubleComplex* const Barray[], int ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int ldc, int batchCount) { cublasStatus_t return_value; - if 
(rpc_start_request(0, RPC_cublasZgemmStridedBatched) < 0 || + if (rpc_start_request(0, RPC_cublasZgemmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -18805,70 +18778,134 @@ cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &k, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || - rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int64_t lda, const cuDoubleComplex* const Barray[], int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int64_t ldc, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + 
rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, long long int strideA, const __half* B, int ldb, long long int strideB, const __half* beta, __half* C, int ldc, long long int strideC, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasHgemmStridedBatched) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const __half*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) || + rpc_write(0, &A, sizeof(const __half*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &B, sizeof(const __half*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_write(0, &strideB, sizeof(long long int)) < 0 || - rpc_write(0, &beta, 
sizeof(const cuDoubleComplex*)) < 0 || - rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &beta, sizeof(const __half*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) || + rpc_write(0, C, sizeof(__half)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_write(0, &strideC, sizeof(long long int)) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* B, int64_t ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) +cublasStatus_t cublasHgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* A, int64_t lda, long long int strideA, const __half* B, int64_t ldb, long long int strideB, const __half* beta, __half* C, int64_t ldc, long long int strideC, int64_t batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasZgemmStridedBatched_64) < 0 || + if (rpc_start_request(0, RPC_cublasHgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &k, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || - rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &alpha, 
sizeof(const __half*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) || + rpc_write(0, &A, sizeof(const __half*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &B, sizeof(const __half*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_write(0, &strideB, sizeof(long long int)) < 0 || - rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || - rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &beta, sizeof(const __half*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) || + rpc_write(0, C, sizeof(__half)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_write(0, &strideC, sizeof(long long int)) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) +cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, long long int strideA, const float* B, int ldb, long long int strideB, const float* beta, float* C, int ldc, long long int strideC, int batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasSgeam) < 0 || + if (rpc_start_request(0, RPC_cublasSgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || 
rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const float*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || rpc_write(0, &A, sizeof(const float*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const float*)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || rpc_write(0, &B, sizeof(const float*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const float*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || rpc_write(0, C, sizeof(float)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) @@ -18876,23 +18913,30 @@ cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cubl return return_value; } -cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* beta, const float* B, int64_t ldb, float* C, int64_t ldc) +cublasStatus_t cublasSgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, long long int strideA, const float* B, int64_t ldb, long long int strideB, const float* beta, float* C, int64_t ldc, long long int strideC, int64_t batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasSgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasSgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || 
rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || rpc_write(0, &alpha, sizeof(const float*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || rpc_write(0, &A, sizeof(const float*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const float*)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || rpc_write(0, &B, sizeof(const float*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const float*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || rpc_write(0, C, sizeof(float)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) @@ -18900,23 +18944,30 @@ cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, c return return_value; } -cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) +cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, long long int strideA, const double* B, int ldb, long long int strideB, const double* beta, double* C, int ldc, long long int strideC, int batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDgeam) < 0 || + if (rpc_start_request(0, RPC_cublasDgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, 
sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const double*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || rpc_write(0, &A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || rpc_write(0, &B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const double*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) @@ -18924,23 +18975,30 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cubl return return_value; } -cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* beta, const double* B, int64_t ldb, double* C, int64_t ldc) +cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, long long int strideA, const double* B, int64_t ldb, long long int strideB, const double* beta, double* C, int64_t ldc, long long int strideC, int64_t batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasDgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, 
sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || rpc_write(0, &alpha, sizeof(const double*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || rpc_write(0, &A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double*)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || rpc_write(0, &B, sizeof(const double*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const double*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) @@ -18948,23 +19006,30 @@ cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, c return return_value; } -cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* beta, const cuComplex* B, int ldb, cuComplex* C, int ldc) +cublasStatus_t cublasCgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* B, int ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int ldc, long long int strideC, int batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCgeam) < 0 || + if (rpc_start_request(0, RPC_cublasCgemmStridedBatched) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || rpc_write(0, C, sizeof(cuComplex)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) @@ -18972,23 +19037,30 @@ cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, cubl return return_value; } -cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* beta, const cuComplex* B, int64_t ldb, cuComplex* C, int64_t ldc) +cublasStatus_t cublasCgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* B, int64_t ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, 
RPC_cublasCgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasCgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || rpc_write(0, C, sizeof(cuComplex)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) @@ -18996,19 +19068,333 @@ cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, c return return_value; } -cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* beta, const cuDoubleComplex* B, int ldb, cuDoubleComplex* C, int ldc) +cublasStatus_t cublasCgemm3mStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* B, int ldb, long long int strideB, const cuComplex* beta, 
cuComplex* C, int ldc, long long int strideC, int batchCount) { cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasZgeam) < 0 || + if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || - rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgemm3mStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* B, int64_t ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, 
RPC_cublasCgemm3mStridedBatched_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, long long int strideA, const cuDoubleComplex* B, int ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc, long long int strideC, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgemmStridedBatched) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, 
sizeof(int)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* B, int64_t ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgemmStridedBatched_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + 
rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int64_t lda, const void* const Barray[], cudaDataType Btype, int64_t ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int64_t ldc, int64_t batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasGemmBatchedEx_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const void*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const void*)) < 0) || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &Atype, sizeof(cudaDataType)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 
|| + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &Btype, sizeof(cudaDataType)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const void*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const void*)) < 0) || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &Ctype, sizeof(cudaDataType)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &computeType, sizeof(cublasComputeType_t)) < 0 || + rpc_write(0, &algo, sizeof(cublasGemmAlgo_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgeam) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const float*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(const float*)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &beta, sizeof(const float*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || + rpc_write(0, &B, sizeof(const float*)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, C, sizeof(float)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(float)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t 
transa, cublasOperation_t transb, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* beta, const float* B, int64_t ldb, float* C, int64_t ldc) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgeam_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const float*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(const float*)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const float*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || + rpc_write(0, &B, sizeof(const float*)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(float)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(float)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgeam) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const double*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || 
+ rpc_write(0, &beta, sizeof(const double*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || + rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, C, sizeof(double)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(double)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* beta, const double* B, int64_t ldb, double* C, int64_t ldc) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgeam_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const double*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double*)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const double*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || + rpc_write(0, &B, sizeof(const double*)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(double)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(double)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* beta, const cuComplex* B, 
int ldb, cuComplex* C, int ldc) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgeam) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* beta, const cuComplex* B, int64_t ldb, cuComplex* C, int64_t ldc) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgeam_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex*)) < 0 || + (beta != 
nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, &B, sizeof(const cuComplex*)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* beta, const cuDoubleComplex* B, int ldb, cuDoubleComplex* C, int ldc) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgeam) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || @@ -19030,15 +19416,209 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || - rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &A, sizeof(const 
cuDoubleComplex*)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* const A[], int lda, float* const B[], int ldb, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasStrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const float*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* const 
A[], int64_t lda, float* const B[], int64_t ldb, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasStrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const float*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* const A[], int lda, double* const B[], int ldb, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const double*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, 
sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* const A[], int64_t lda, double* const B[], int64_t ldb, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDtrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const double*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* const A[], int lda, cuComplex* const B[], int ldb, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, 
sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* const A[], int64_t lda, cuComplex* const B[], int64_t ldb, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCtrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t 
uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const B[], int ldb, int batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int64_t lda, cuDoubleComplex* const B[], int64_t ldb, int64_t batchCount) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZtrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const 
cuDoubleComplex*)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &A, sizeof(A)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const cuDoubleComplex*)) < 0 || - rpc_write(0, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_write(0, &B, sizeof(B)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; @@ -19055,6 +19635,7 @@ cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_write(0, &A, sizeof(const float*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &x, sizeof(const float*)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, C, sizeof(float)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || @@ -19076,6 +19657,7 @@ cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_write(0, &A, sizeof(const float*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, &x, sizeof(const float*)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, C, sizeof(float)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || @@ -19097,6 +19679,7 @@ cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_write(0, &A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &x, sizeof(const double*)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || @@ -19118,6 +19701,7 @@ cublasStatus_t cublasDdgmm_64(cublasHandle_t 
handle, cublasSideMode_t mode, int6 rpc_write(0, &A, sizeof(const double*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, &x, sizeof(const double*)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, C, sizeof(double)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || @@ -19139,6 +19723,7 @@ cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &x, sizeof(const cuComplex*)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, C, sizeof(cuComplex)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || @@ -19160,6 +19745,7 @@ cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_write(0, &A, sizeof(const cuComplex*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, &x, sizeof(const cuComplex*)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, C, sizeof(cuComplex)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || @@ -19181,6 +19767,7 @@ cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const cuDoubleComplex)) < 0) || rpc_write(0, &incx, sizeof(int)) < 0 || rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &ldc, sizeof(int)) < 0 || @@ -19202,6 +19789,7 @@ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || rpc_write(0, &x, sizeof(const cuDoubleComplex*)) < 0 || + (x != nullptr && rpc_write(0, x, 
sizeof(const cuDoubleComplex)) < 0) || rpc_write(0, &incx, sizeof(int64_t)) < 0 || rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &ldc, sizeof(int64_t)) < 0 || @@ -19212,6 +19800,254 @@ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 return return_value; } +cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* const A[], int lda, float* const Ainv[], int lda_inv, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* const A[], int lda, double* const Ainv[], int lda_inv, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, cuComplex* const 
Ainv[], int lda_inv, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const Ainv[], int lda_inv, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(Ainv)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* const Aarray[], int lda, float* const TauArray[], int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + 
rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* const Aarray[], int lda, double* const TauArray[], int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComplex* const Aarray[], int lda, cuComplex* const TauArray[], int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDoubleComplex* const Aarray[], int lda, 
cuDoubleComplex* const TauArray[], int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(TauArray)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, float* const Aarray[], int lda, float* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, double* const Aarray[], int lda, double* const Carray[], int ldc, int* info, int* devInfoArray, int 
batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuComplex* const Aarray[], int lda, cuComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t 
cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(Carray)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda) { cublasStatus_t return_value; @@ -19220,6 +20056,7 @@ cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &AP, sizeof(const float*)) < 0 || + (AP != nullptr && rpc_write(0, AP, sizeof(const float)) < 0) || rpc_write(0, A, sizeof(float)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19237,6 +20074,7 @@ cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &AP, sizeof(const double*)) < 0 || + (AP != nullptr && rpc_write(0, AP, sizeof(const double)) < 0) || rpc_write(0, A, sizeof(double)) < 0 
|| rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19254,6 +20092,7 @@ cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &AP, sizeof(const cuComplex*)) < 0 || + (AP != nullptr && rpc_write(0, AP, sizeof(const cuComplex)) < 0) || rpc_write(0, A, sizeof(cuComplex)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19271,6 +20110,7 @@ cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &AP, sizeof(const cuDoubleComplex*)) < 0 || + (AP != nullptr && rpc_write(0, AP, sizeof(const cuDoubleComplex)) < 0) || rpc_write(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19288,6 +20128,7 @@ cublasStatus_t cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &A, sizeof(const float*)) < 0 || + (A != nullptr && rpc_write(0, A, sizeof(const float)) < 0) || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, AP, sizeof(float)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19305,6 +20146,7 @@ cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &A, sizeof(const double*)) < 0 || + (A != nullptr && rpc_write(0, A, sizeof(const double)) < 0) || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19322,6 +20164,7 @@ cublasStatus_t cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &A, 
sizeof(const cuComplex*)) < 0 || + (A != nullptr && rpc_write(0, A, sizeof(const cuComplex)) < 0) || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, AP, sizeof(cuComplex)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19339,6 +20182,7 @@ cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || rpc_write(0, &A, sizeof(const cuDoubleComplex*)) < 0 || + (A != nullptr && rpc_write(0, A, sizeof(const cuDoubleComplex)) < 0) || rpc_write(0, &lda, sizeof(int)) < 0 || rpc_write(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_wait_for_response(0) < 0 || @@ -19348,6 +20192,181 @@ cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, return return_value; } +cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* const A[], int lda, const int* P, float* const C[], int ldc, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgetriBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &P, sizeof(const int*)) < 0 || + (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) || + rpc_write(0, &C, sizeof(C)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* const A[], int lda, const int* P, double* const C[], int ldc, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgetriBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + 
rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &P, sizeof(const int*)) < 0 || + (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) || + rpc_write(0, &C, sizeof(C)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, const int* P, cuComplex* const C[], int ldc, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgetriBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &P, sizeof(const int*)) < 0 || + (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) || + rpc_write(0, &C, sizeof(C)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, const int* P, cuDoubleComplex* const C[], int ldc, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgetriBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(A)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &P, 
sizeof(const int*)) < 0 || + (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) || + rpc_write(0, &C, sizeof(C)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float* const Aarray[], int lda, const int* devIpiv, float* const Barray[], int ldb, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || + (devIpiv != nullptr && rpc_write(0, devIpiv, sizeof(const int)) < 0) || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const double* const Aarray[], int lda, const int* devIpiv, double* const Barray[], int ldb, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, 
&nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || + (devIpiv != nullptr && rpc_write(0, devIpiv, sizeof(const int)) < 0) || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuComplex* const Aarray[], int lda, const int* devIpiv, cuComplex* const Barray[], int ldb, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuDoubleComplex* const Aarray[], int lda, const int* devIpiv, cuDoubleComplex* const Barray[], int ldb, int* info, int batchSize) +{ + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + 
rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(Aarray)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int*)) < 0 || + (devIpiv != nullptr && rpc_write(0, devIpiv, sizeof(const int)) < 0) || + rpc_write(0, &Barray, sizeof(Barray)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + cublasStatus_t cublasUint8gemmBias(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, int m, int n, int k, const unsigned char* A, int A_bias, int lda, const unsigned char* B, int B_bias, int ldb, unsigned char* C, int C_bias, int ldc, int C_mult, int C_shift) { cublasStatus_t return_value; @@ -21644,6 +22663,18 @@ std::unordered_map functionMap = { {"cublasCtrmm_v2_64", (void *)cublasCtrmm_v2_64}, {"cublasZtrmm_v2", (void *)cublasZtrmm_v2}, {"cublasZtrmm_v2_64", (void *)cublasZtrmm_v2_64}, + {"cublasHgemmBatched", (void *)cublasHgemmBatched}, + {"cublasHgemmBatched_64", (void *)cublasHgemmBatched_64}, + {"cublasSgemmBatched", (void *)cublasSgemmBatched}, + {"cublasSgemmBatched_64", (void *)cublasSgemmBatched_64}, + {"cublasDgemmBatched", (void *)cublasDgemmBatched}, + {"cublasDgemmBatched_64", (void *)cublasDgemmBatched_64}, + {"cublasCgemmBatched", (void *)cublasCgemmBatched}, + {"cublasCgemmBatched_64", (void *)cublasCgemmBatched_64}, + {"cublasCgemm3mBatched", (void *)cublasCgemm3mBatched}, + {"cublasCgemm3mBatched_64", (void *)cublasCgemm3mBatched_64}, + {"cublasZgemmBatched", (void *)cublasZgemmBatched}, + {"cublasZgemmBatched_64", (void *)cublasZgemmBatched_64}, {"cublasHgemmStridedBatched", (void *)cublasHgemmStridedBatched}, {"cublasHgemmStridedBatched_64", 
(void *)cublasHgemmStridedBatched_64}, {"cublasSgemmStridedBatched", (void *)cublasSgemmStridedBatched}, @@ -21656,6 +22687,7 @@ std::unordered_map functionMap = { {"cublasCgemm3mStridedBatched_64", (void *)cublasCgemm3mStridedBatched_64}, {"cublasZgemmStridedBatched", (void *)cublasZgemmStridedBatched}, {"cublasZgemmStridedBatched_64", (void *)cublasZgemmStridedBatched_64}, + {"cublasGemmBatchedEx_64", (void *)cublasGemmBatchedEx_64}, {"cublasSgeam", (void *)cublasSgeam}, {"cublasSgeam_64", (void *)cublasSgeam_64}, {"cublasDgeam", (void *)cublasDgeam}, @@ -21664,6 +22696,14 @@ std::unordered_map functionMap = { {"cublasCgeam_64", (void *)cublasCgeam_64}, {"cublasZgeam", (void *)cublasZgeam}, {"cublasZgeam_64", (void *)cublasZgeam_64}, + {"cublasStrsmBatched", (void *)cublasStrsmBatched}, + {"cublasStrsmBatched_64", (void *)cublasStrsmBatched_64}, + {"cublasDtrsmBatched", (void *)cublasDtrsmBatched}, + {"cublasDtrsmBatched_64", (void *)cublasDtrsmBatched_64}, + {"cublasCtrsmBatched", (void *)cublasCtrsmBatched}, + {"cublasCtrsmBatched_64", (void *)cublasCtrsmBatched_64}, + {"cublasZtrsmBatched", (void *)cublasZtrsmBatched}, + {"cublasZtrsmBatched_64", (void *)cublasZtrsmBatched_64}, {"cublasSdgmm", (void *)cublasSdgmm}, {"cublasSdgmm_64", (void *)cublasSdgmm_64}, {"cublasDdgmm", (void *)cublasDdgmm}, @@ -21672,6 +22712,18 @@ std::unordered_map functionMap = { {"cublasCdgmm_64", (void *)cublasCdgmm_64}, {"cublasZdgmm", (void *)cublasZdgmm}, {"cublasZdgmm_64", (void *)cublasZdgmm_64}, + {"cublasSmatinvBatched", (void *)cublasSmatinvBatched}, + {"cublasDmatinvBatched", (void *)cublasDmatinvBatched}, + {"cublasCmatinvBatched", (void *)cublasCmatinvBatched}, + {"cublasZmatinvBatched", (void *)cublasZmatinvBatched}, + {"cublasSgeqrfBatched", (void *)cublasSgeqrfBatched}, + {"cublasDgeqrfBatched", (void *)cublasDgeqrfBatched}, + {"cublasCgeqrfBatched", (void *)cublasCgeqrfBatched}, + {"cublasZgeqrfBatched", (void *)cublasZgeqrfBatched}, + {"cublasSgelsBatched", (void 
*)cublasSgelsBatched}, + {"cublasDgelsBatched", (void *)cublasDgelsBatched}, + {"cublasCgelsBatched", (void *)cublasCgelsBatched}, + {"cublasZgelsBatched", (void *)cublasZgelsBatched}, {"cublasStpttr", (void *)cublasStpttr}, {"cublasDtpttr", (void *)cublasDtpttr}, {"cublasCtpttr", (void *)cublasCtpttr}, @@ -21680,6 +22732,14 @@ std::unordered_map functionMap = { {"cublasDtrttp", (void *)cublasDtrttp}, {"cublasCtrttp", (void *)cublasCtrttp}, {"cublasZtrttp", (void *)cublasZtrttp}, + {"cublasSgetriBatched", (void *)cublasSgetriBatched}, + {"cublasDgetriBatched", (void *)cublasDgetriBatched}, + {"cublasCgetriBatched", (void *)cublasCgetriBatched}, + {"cublasZgetriBatched", (void *)cublasZgetriBatched}, + {"cublasSgetrsBatched", (void *)cublasSgetrsBatched}, + {"cublasDgetrsBatched", (void *)cublasDgetrsBatched}, + {"cublasCgetrsBatched", (void *)cublasCgetrsBatched}, + {"cublasZgetrsBatched", (void *)cublasZgetrsBatched}, {"cublasUint8gemmBias", (void *)cublasUint8gemmBias}, {"cudnnGetProperty", (void *)cudnnGetProperty}, {"cudnnCreate", (void *)cudnnCreate}, diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp index b243014..ab04e00 100644 --- a/codegen/gen_server.cpp +++ b/codegen/gen_server.cpp @@ -33924,15 +33924,13 @@ int handle_cublasSgemvBatched(void *conn) int m; int n; const float* alpha; + const float* * Aarray = nullptr; int lda; + const float* * xarray = nullptr; int incx; const float* beta; + float* * yarray = nullptr; int incy; - if (rpc_read(conn, &batchCount, sizeof(int)) < 0) - return -1; - const float* * Aarray = new const float* [batchCount]; - const float* * xarray = new const float* [batchCount]; - float* * yarray = new float* [batchCount]; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -33942,12 +33940,12 @@ int handle_cublasSgemvBatched(void *conn) rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || - rpc_read(conn, Aarray, sizeof(const float* 
const[batchCount])) < 0 || + rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, xarray, sizeof(const float* const[batchCount])) < 0 || + rpc_read(conn, &xarray, sizeof(const float* const)) < 0 || rpc_read(conn, &incx, sizeof(int)) < 0 || rpc_read(conn, &beta, sizeof(const float*)) < 0 || - rpc_read(conn, yarray, sizeof(float* const[batchCount])) < 0 || + rpc_read(conn, &yarray, sizeof(float* const)) < 0 || rpc_read(conn, &incy, sizeof(int)) < 0 || false) goto ERROR_0; @@ -33974,15 +33972,13 @@ int handle_cublasTSTgemvBatched(void *conn) int m; int n; const float* alpha; + const __nv_bfloat16* * Aarray = nullptr; int lda; + const __nv_bfloat16* * xarray = nullptr; int incx; const float* beta; + __nv_bfloat16* * yarray = nullptr; int incy; - if (rpc_read(conn, &batchCount, sizeof(int)) < 0) - return -1; - const __nv_bfloat16* * Aarray = new const __nv_bfloat16* [batchCount]; - const __nv_bfloat16* * xarray = new const __nv_bfloat16* [batchCount]; - __nv_bfloat16* * yarray = new __nv_bfloat16* [batchCount]; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -33992,12 +33988,12 @@ int handle_cublasTSTgemvBatched(void *conn) rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || - rpc_read(conn, Aarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 || + rpc_read(conn, &Aarray, sizeof(const __nv_bfloat16* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, xarray, sizeof(const __nv_bfloat16* const[batchCount])) < 0 || + rpc_read(conn, &xarray, sizeof(const __nv_bfloat16* const)) < 0 || rpc_read(conn, &incx, sizeof(int)) < 0 || rpc_read(conn, &beta, sizeof(const float*)) < 0 || - rpc_read(conn, yarray, sizeof(__nv_bfloat16* const[batchCount])) < 0 || + rpc_read(conn, &yarray, sizeof(__nv_bfloat16* const)) < 0 || rpc_read(conn, &incy, sizeof(int)) < 0 || false) goto ERROR_0; @@ -38701,8 +38697,9 
@@ int handle_cublasZtrmm_v2_64(void *conn) return -1; } -int handle_cublasHgemmStridedBatched(void *conn) +int handle_cublasHgemmBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -38710,20 +38707,17 @@ int handle_cublasHgemmStridedBatched(void *conn) int n; int k; const __half* alpha; - const __half* A; + const __half* * Aarray = nullptr; int lda; - long long int strideA; - const __half* B; + const __half* * Barray = nullptr; int ldb; - long long int strideB; const __half* beta; - __half C; + __half* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -38731,27 +38725,22 @@ int handle_cublasHgemmStridedBatched(void *conn) rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const __half*)) < 0 || - rpc_read(conn, &A, sizeof(const __half*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const __half* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const __half*)) < 0 || + rpc_read(conn, &Barray, sizeof(const __half* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const __half*)) < 0 || - rpc_read(conn, &C, sizeof(__half)) < 0 || + rpc_read(conn, &Carray, sizeof(__half* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = 
cublasHgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasHgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(__half)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38760,8 +38749,9 @@ int handle_cublasHgemmStridedBatched(void *conn) return -1; } -int handle_cublasHgemmStridedBatched_64(void *conn) +int handle_cublasHgemmBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -38769,20 +38759,17 @@ int handle_cublasHgemmStridedBatched_64(void *conn) int64_t n; int64_t k; const __half* alpha; - const __half* A; + const __half* * Aarray = nullptr; int64_t lda; - long long int strideA; - const __half* B; + const __half* * Barray = nullptr; int64_t ldb; - long long int strideB; const __half* beta; - __half C; + __half* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -38790,27 +38777,22 @@ int handle_cublasHgemmStridedBatched_64(void *conn) rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const __half*)) < 0 || - rpc_read(conn, &A, sizeof(const __half*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const __half* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const __half*)) < 0 || + rpc_read(conn, &Barray, sizeof(const __half* const)) < 0 || rpc_read(conn, 
&ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const __half*)) < 0 || - rpc_read(conn, &C, sizeof(__half)) < 0 || + rpc_read(conn, &Carray, sizeof(__half* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasHgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasHgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(__half)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38819,8 +38801,9 @@ int handle_cublasHgemmStridedBatched_64(void *conn) return -1; } -int handle_cublasSgemmStridedBatched(void *conn) +int handle_cublasSgemmBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -38828,20 +38811,17 @@ int handle_cublasSgemmStridedBatched(void *conn) int n; int k; const float* alpha; - const float* A; + const float* * Aarray = nullptr; int lda; - long long int strideA; - const float* B; + const float* * Barray = nullptr; int ldb; - long long int strideB; const float* beta; - float C; + float* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -38849,27 +38829,22 @@ int handle_cublasSgemmStridedBatched(void *conn) 
rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || - rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const float*)) < 0 || + rpc_read(conn, &Barray, sizeof(const float* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const float*)) < 0 || - rpc_read(conn, &C, sizeof(float)) < 0 || + rpc_read(conn, &Carray, sizeof(float* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(float)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38878,8 +38853,9 @@ int handle_cublasSgemmStridedBatched(void *conn) return -1; } -int handle_cublasSgemmStridedBatched_64(void *conn) +int handle_cublasSgemmBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; @@ -38887,20 +38863,17 @@ int handle_cublasSgemmStridedBatched_64(void *conn) int64_t n; int64_t k; const float* alpha; - const float* A; + const float* * Aarray = nullptr; int64_t lda; - long long int strideA; - const float* B; + const float* * Barray = nullptr; int64_t ldb; - long long int strideB; const 
float* beta; - float C; + float* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || @@ -38908,27 +38881,22 @@ int handle_cublasSgemmStridedBatched_64(void *conn) rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha, sizeof(const float*)) < 0 || - rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const float*)) < 0 || + rpc_read(conn, &Barray, sizeof(const float* const)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || rpc_read(conn, &beta, sizeof(const float*)) < 0 || - rpc_read(conn, &C, sizeof(float)) < 0 || + rpc_read(conn, &Carray, sizeof(float* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasSgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(float)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38937,57 +38905,54 @@ int handle_cublasSgemmStridedBatched_64(void *conn) return -1; } 
-int handle_cublasDgemmStridedBatched(void *conn) +int handle_cublasDgemmBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; int k; - const double* alpha; - const double* A; + double* alpha_null_check; + double alpha; + const double* * Aarray = nullptr; int lda; - long long int strideA; - const double* B; + const double* * Barray = nullptr; int ldb; - long long int strideB; - const double* beta; - double C; + double* beta_null_check; + double beta; + double* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || - rpc_read(conn, &alpha, sizeof(const double*)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const double*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) || + rpc_read(conn, &Aarray, sizeof(const double* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const double*)) < 0 || + rpc_read(conn, &Barray, sizeof(const double* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const double*)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const double*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || + rpc_read(conn, &Carray, sizeof(double* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, 
sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasDgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -38996,57 +38961,54 @@ int handle_cublasDgemmStridedBatched(void *conn) return -1; } -int handle_cublasDgemmStridedBatched_64(void *conn) +int handle_cublasDgemmBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; int64_t k; - const double* alpha; - const double* A; + double* alpha_null_check; + double alpha; + const double* * Aarray = nullptr; int64_t lda; - long long int strideA; - const double* B; + const double* * Barray = nullptr; int64_t ldb; - long long int strideB; - const double* beta; - double C; + double* beta_null_check; + double beta; + double* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || - rpc_read(conn, &alpha, sizeof(const double*)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const double*)) < 0 || + (alpha_null_check && 
rpc_read(conn, &alpha, sizeof(const double)) < 0) || + rpc_read(conn, &Aarray, sizeof(const double* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const double*)) < 0 || + rpc_read(conn, &Barray, sizeof(const double* const)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const double*)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const double*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || + rpc_read(conn, &Carray, sizeof(double* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasDgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39055,57 +39017,54 @@ int handle_cublasDgemmStridedBatched_64(void *conn) return -1; } -int handle_cublasCgemmStridedBatched(void *conn) +int handle_cublasCgemmBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; int k; - const cuComplex* alpha; - const cuComplex* A; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* * Aarray = nullptr; int lda; - long long int strideA; - const cuComplex* B; + const cuComplex* * Barray = nullptr; int ldb; - long 
long int strideB; - const cuComplex* beta; - cuComplex C; + cuComplex* beta_null_check; + cuComplex beta; + cuComplex* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasCgemmBatched(handle, 
transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39114,57 +39073,54 @@ int handle_cublasCgemmStridedBatched(void *conn) return -1; } -int handle_cublasCgemmStridedBatched_64(void *conn) +int handle_cublasCgemmBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; int64_t k; - const cuComplex* alpha; - const cuComplex* A; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* * Aarray = nullptr; int64_t lda; - long long int strideA; - const cuComplex* B; + const cuComplex* * Barray = nullptr; int64_t ldb; - long long int strideB; - const cuComplex* beta; - cuComplex C; + cuComplex* beta_null_check; + cuComplex beta; + cuComplex* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 || 
rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasCgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39173,57 +39129,54 @@ int handle_cublasCgemmStridedBatched_64(void *conn) return -1; } -int handle_cublasCgemm3mStridedBatched(void *conn) +int handle_cublasCgemm3mBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; int k; - const cuComplex* alpha; - const cuComplex* A; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* * Aarray = nullptr; int lda; - long long int strideA; - const cuComplex* B; + const cuComplex* * Barray = nullptr; int ldb; - long long int strideB; - const cuComplex* beta; - cuComplex C; + cuComplex* beta_null_check; + cuComplex beta; + cuComplex* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || 
rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgemm3mStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasCgemm3mBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39232,57 +39185,54 @@ int 
handle_cublasCgemm3mStridedBatched(void *conn) return -1; } -int handle_cublasCgemm3mStridedBatched_64(void *conn) +int handle_cublasCgemm3mBatched_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; int64_t k; - const cuComplex* alpha; - const cuComplex* A; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* * Aarray = nullptr; int64_t lda; - long long int strideA; - const cuComplex* B; + const cuComplex* * Barray = nullptr; int64_t ldb; - long long int strideB; - const cuComplex* beta; - cuComplex C; + cuComplex* beta_null_check; + cuComplex beta; + cuComplex* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + 
(beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgemm3mStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasCgemm3mBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39291,57 +39241,54 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) return -1; } -int handle_cublasZgemmStridedBatched(void *conn) +int handle_cublasZgemmBatched(void *conn) { + int batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; int k; - const cuDoubleComplex* alpha; - const cuDoubleComplex* A; + cuDoubleComplex* alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex* * Aarray = nullptr; int lda; - long long int strideA; - const cuDoubleComplex* B; + const cuDoubleComplex* * Barray = nullptr; int ldb; - long long int strideB; - const cuDoubleComplex* beta; - cuDoubleComplex C; + cuDoubleComplex* beta_null_check; + cuDoubleComplex beta; + cuDoubleComplex* * Carray = nullptr; int ldc; - long long int strideC; - int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || 
rpc_read(conn, &n, sizeof(int)) < 0 || rpc_read(conn, &k, sizeof(int)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuDoubleComplex* const)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &Carray, sizeof(cuDoubleComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasZgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39350,57 +39297,54 @@ int handle_cublasZgemmStridedBatched(void *conn) return -1; } -int handle_cublasZgemmStridedBatched_64(void *conn) +int handle_cublasZgemmBatched_64(void 
*conn) { + int64_t batchCount; cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; int64_t k; - const cuDoubleComplex* alpha; - const cuDoubleComplex* A; + cuDoubleComplex* alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex* * Aarray = nullptr; int64_t lda; - long long int strideA; - const cuDoubleComplex* B; + const cuDoubleComplex* * Barray = nullptr; int64_t ldb; - long long int strideB; - const cuDoubleComplex* beta; - cuDoubleComplex C; + cuDoubleComplex* beta_null_check; + cuDoubleComplex beta; + cuDoubleComplex* * Carray = nullptr; int64_t ldc; - long long int strideC; - int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || rpc_read(conn, &k, sizeof(int64_t)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideA, sizeof(long long int)) < 0 || - rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &Barray, sizeof(const cuDoubleComplex* const)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideB, sizeof(long long int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (beta_null_check && rpc_read(conn, 
&beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &Carray, sizeof(cuDoubleComplex* const)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || - rpc_read(conn, &strideC, sizeof(long long int)) < 0 || - rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZgemmStridedBatched_64(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, &C, ldc, strideC, batchCount); + scuda_intercept_result = cublasZgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray, lda, Barray, ldb, &beta, Carray, ldc, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39409,21 +39353,154 @@ int handle_cublasZgemmStridedBatched_64(void *conn) return -1; } -int handle_cublasSgeam(void *conn) +int handle_cublasHgemmStridedBatched(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; - const float* alpha; - const float* A; + int k; + __half* alpha_null_check; + __half alpha; + const __half* A; int lda; - const float* beta; - const float* B; + long long int strideA; + const __half* B; + int ldb; + long long int strideB; + __half* beta_null_check; + __half beta; + __half C; + int ldc; + long long int strideC; + int batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const __half*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) || + rpc_read(conn, &A, 
sizeof(const __half*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const __half*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const __half*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) || + rpc_read(conn, &C, sizeof(__half)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasHgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(__half)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasHgemmStridedBatched_64(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + __half* alpha_null_check; + __half alpha; + const __half* A; + int64_t lda; + long long int strideA; + const __half* B; + int64_t ldb; + long long int strideB; + __half* beta_null_check; + __half beta; + __half C; + int64_t ldc; + long long int strideC; + int64_t batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const __half*)) < 0 || + 
(alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) || + rpc_read(conn, &A, sizeof(const __half*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const __half*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const __half*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) || + rpc_read(conn, &C, sizeof(__half)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasHgemmStridedBatched_64(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(__half)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgemmStridedBatched(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + int k; + float* alpha_null_check; + float alpha; + const float* A; + int lda; + long long int strideA; + const float* B; int ldb; + long long int strideB; + float* beta_null_check; + float beta; float C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39432,21 +39509,28 @@ int handle_cublasSgeam(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &alpha, sizeof(const float*)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, 
&alpha_null_check, sizeof(const float*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) || rpc_read(conn, &A, sizeof(const float*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta, sizeof(const float*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const float*)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const float*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) || rpc_read(conn, &C, sizeof(float)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasSgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(float)) < 0 || @@ -39458,21 +39542,28 @@ int handle_cublasSgeam(void *conn) return -1; } -int handle_cublasSgeam_64(void *conn) +int handle_cublasSgemmStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; - const float* alpha; + int64_t k; + float* alpha_null_check; + float alpha; const float* A; int64_t lda; - const float* beta; + long long int strideA; const float* B; int64_t ldb; + long long int strideB; + float* beta_null_check; + float beta; float C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39481,21 +39572,28 @@ int handle_cublasSgeam_64(void *conn) rpc_read(conn, &transb, 
sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &alpha, sizeof(const float*)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const float*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) || rpc_read(conn, &A, sizeof(const float*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta, sizeof(const float*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const float*)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const float*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) || rpc_read(conn, &C, sizeof(float)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasSgemmStridedBatched_64(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(float)) < 0 || @@ -39507,21 +39605,28 @@ int handle_cublasSgeam_64(void *conn) return -1; } -int handle_cublasDgeam(void *conn) +int handle_cublasDgemmStridedBatched(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; - const double* alpha; + int k; + double* alpha_null_check; + double alpha; const double* A; int lda; - const double* beta; + long long int strideA; const double* B; int ldb; + long long int strideB; + double* beta_null_check; + double 
beta; double C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39530,21 +39635,28 @@ int handle_cublasDgeam(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &alpha, sizeof(const double*)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const double*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) || rpc_read(conn, &A, sizeof(const double*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta, sizeof(const double*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const double*)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const double*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || rpc_read(conn, &C, sizeof(double)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasDgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -39556,21 +39668,28 @@ int handle_cublasDgeam(void *conn) return -1; } -int handle_cublasDgeam_64(void *conn) +int handle_cublasDgemmStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; - const double* alpha; + int64_t k; + 
double* alpha_null_check; + double alpha; const double* A; int64_t lda; - const double* beta; + long long int strideA; const double* B; int64_t ldb; + long long int strideB; + double* beta_null_check; + double beta; double C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39579,21 +39698,28 @@ int handle_cublasDgeam_64(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &alpha, sizeof(const double*)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const double*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) || rpc_read(conn, &A, sizeof(const double*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta, sizeof(const double*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const double*)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const double*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || rpc_read(conn, &C, sizeof(double)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasDgemmStridedBatched_64(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -39605,21 +39731,28 @@ int 
handle_cublasDgeam_64(void *conn) return -1; } -int handle_cublasCgeam(void *conn) +int handle_cublasCgemmStridedBatched(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; - const cuComplex* alpha; + int k; + cuComplex* alpha_null_check; + cuComplex alpha; const cuComplex* A; int lda; - const cuComplex* beta; + long long int strideA; const cuComplex* B; int ldb; + long long int strideB; + cuComplex* beta_null_check; + cuComplex beta; cuComplex C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39628,21 +39761,28 @@ int handle_cublasCgeam(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || rpc_read(conn, &C, sizeof(cuComplex)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = 
cublasCgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuComplex)) < 0 || @@ -39654,21 +39794,28 @@ int handle_cublasCgeam(void *conn) return -1; } -int handle_cublasCgeam_64(void *conn) +int handle_cublasCgemmStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; - const cuComplex* alpha; + int64_t k; + cuComplex* alpha_null_check; + cuComplex alpha; const cuComplex* A; int64_t lda; - const cuComplex* beta; + long long int strideA; const cuComplex* B; int64_t ldb; + long long int strideB; + cuComplex* beta_null_check; + cuComplex beta; cuComplex C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39677,21 +39824,28 @@ int handle_cublasCgeam_64(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || rpc_read(conn, &C, sizeof(cuComplex)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, 
sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasCgemmStridedBatched_64(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuComplex)) < 0 || @@ -39703,21 +39857,28 @@ int handle_cublasCgeam_64(void *conn) return -1; } -int handle_cublasZgeam(void *conn) +int handle_cublasCgemm3mStridedBatched(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; - const cuDoubleComplex* alpha; - const cuDoubleComplex* A; + int k; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* A; int lda; - const cuDoubleComplex* beta; - const cuDoubleComplex* B; + long long int strideA; + const cuComplex* B; int ldb; - cuDoubleComplex C; + long long int strideB; + cuComplex* beta_null_check; + cuComplex beta; + cuComplex C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39726,24 +39887,31 @@ int handle_cublasZgeam(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &B, sizeof(const 
cuDoubleComplex*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasCgemm3mStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39752,21 +39920,154 @@ int handle_cublasZgeam(void *conn) return -1; } -int handle_cublasZgeam_64(void *conn) +int handle_cublasCgemm3mStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; - const cuDoubleComplex* alpha; - const cuDoubleComplex* A; + int64_t k; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* A; int64_t lda; - const cuDoubleComplex* beta; - const cuDoubleComplex* B; + long long int strideA; + const cuComplex* B; int64_t ldb; + long long int strideB; + cuComplex* beta_null_check; + cuComplex beta; + cuComplex C; + int64_t ldc; + long long int strideC; + int64_t batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, 
sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgemm3mStridedBatched_64(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgemmStridedBatched(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + int k; + cuDoubleComplex* alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex* A; + int lda; + long long int strideA; + const cuDoubleComplex* B; + int ldb; + long long int strideB; + cuDoubleComplex* beta_null_check; + cuDoubleComplex beta; + 
cuDoubleComplex C; + int ldc; + long long int strideC; + int batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgemmStridedBatched(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgemmStridedBatched_64(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + cuDoubleComplex* alpha_null_check; + cuDoubleComplex 
alpha; + const cuDoubleComplex* A; + int64_t lda; + long long int strideA; + const cuDoubleComplex* B; + int64_t ldb; + long long int strideB; + cuDoubleComplex* beta_null_check; + cuDoubleComplex beta; cuDoubleComplex C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if ( @@ -39775,21 +40076,28 @@ int handle_cublasZgeam_64(void *conn) rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &alpha, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) || rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasZgemmStridedBatched_64(handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || 
rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || @@ -39801,29 +40109,105 @@ int handle_cublasZgeam_64(void *conn) return -1; } -int handle_cublasSdgmm(void *conn) +int handle_cublasGemmBatchedEx_64(void *conn) { + int64_t batchCount; cublasHandle_t handle; - cublasSideMode_t mode; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + void* alpha_null_check; + void* alpha; + const void* * Aarray = nullptr; + cudaDataType Atype; + int64_t lda; + const void* * Barray = nullptr; + cudaDataType Btype; + int64_t ldb; + void* beta_null_check; + void* beta; + void* * Carray = nullptr; + cudaDataType Ctype; + int64_t ldc; + cublasComputeType_t computeType; + cublasGemmAlgo_t algo; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const void*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const void*)) < 0) || + rpc_read(conn, &Aarray, sizeof(const void* const)) < 0 || + rpc_read(conn, &Atype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &Barray, sizeof(const void* const)) < 0 || + rpc_read(conn, &Btype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const void*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const void*)) < 0) || + rpc_read(conn, &Carray, sizeof(void* const)) < 0 || + rpc_read(conn, &Ctype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &computeType, sizeof(cublasComputeType_t)) < 0 || + rpc_read(conn, 
&algo, sizeof(cublasGemmAlgo_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasGemmBatchedEx_64(handle, transa, transb, m, n, k, &alpha, Aarray, Atype, lda, Barray, Btype, ldb, &beta, Carray, Ctype, ldc, batchCount, computeType, algo); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgeam(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; int m; int n; + float* alpha_null_check; + float alpha; const float* A; int lda; - const float* x; - int incx; + float* beta_null_check; + float beta; + const float* B; + int ldb; float C; int ldc; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const float*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) || rpc_read(conn, &A, sizeof(const float*)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x, sizeof(const float*)) < 0 || - rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const float*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) || + rpc_read(conn, &B, sizeof(const float*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || rpc_read(conn, &C, sizeof(float)) < 0 || rpc_read(conn, &ldc, sizeof(int)) < 0 || false) @@ -39832,7 +40216,7 @@ int handle_cublasSdgmm(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = 
cublasSdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, &alpha, A, lda, &beta, B, ldb, &C, ldc); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(float)) < 0 || @@ -39844,29 +40228,39 @@ int handle_cublasSdgmm(void *conn) return -1; } -int handle_cublasSdgmm_64(void *conn) +int handle_cublasSgeam_64(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; + cublasOperation_t transa; + cublasOperation_t transb; int64_t m; int64_t n; + float* alpha_null_check; + float alpha; const float* A; int64_t lda; - const float* x; - int64_t incx; + float* beta_null_check; + float beta; + const float* B; + int64_t ldb; float C; int64_t ldc; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const float*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) || rpc_read(conn, &A, sizeof(const float*)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x, sizeof(const float*)) < 0 || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const float*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) || + rpc_read(conn, &B, sizeof(const float*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || rpc_read(conn, &C, sizeof(float)) < 0 || rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) @@ -39875,10 +40269,1666 @@ int handle_cublasSdgmm_64(void *conn) request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasSdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); 
+ scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, &alpha, A, lda, &beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(float)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgeam(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + double* alpha_null_check; + double alpha; + const double* A; + int lda; + double* beta_null_check; + double beta; + const double* B; + int ldb; + double C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const double*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) || + rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const double*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || + rpc_read(conn, &B, sizeof(const double*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDgeam(handle, transa, transb, m, n, &alpha, A, lda, &beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgeam_64(void *conn) +{ + cublasHandle_t handle; 
+ cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + double* alpha_null_check; + double alpha; + const double* A; + int64_t lda; + double* beta_null_check; + double beta; + const double* B; + int64_t ldb; + double C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const double*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) || + rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const double*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || + rpc_read(conn, &B, sizeof(const double*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, &alpha, A, lda, &beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgeam(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* A; + int lda; + cuComplex* beta_null_check; + cuComplex beta; + const cuComplex* B; + int ldb; + cuComplex C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + 
rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, &alpha, A, lda, &beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgeam_64(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* A; + int64_t lda; + cuComplex* beta_null_check; + cuComplex beta; + const cuComplex* B; + int64_t ldb; + cuComplex C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const 
cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &B, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, &alpha, A, lda, &beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgeam(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + cuDoubleComplex* alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex* A; + int lda; + cuDoubleComplex* beta_null_check; + cuDoubleComplex beta; + const cuDoubleComplex* B; + int ldb; + cuDoubleComplex C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex*)) < 0 
|| + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, &alpha, A, lda, &beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgeam_64(void *conn) +{ + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + cuDoubleComplex* alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex* A; + int64_t lda; + cuDoubleComplex* beta_null_check; + cuDoubleComplex beta; + const cuDoubleComplex* B; + int64_t ldb; + cuDoubleComplex C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &B, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, 
sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, &alpha, A, lda, &beta, B, ldb, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasStrsmBatched(void *conn) +{ + int batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int m; + int n; + float* alpha_null_check; + float alpha; + const float* * A = nullptr; + int lda; + float* * B = nullptr; + int ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const float*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) || + rpc_read(conn, &A, sizeof(const float* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &B, sizeof(float* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasStrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto 
ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasStrsmBatched_64(void *conn) +{ + int64_t batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int64_t m; + int64_t n; + float* alpha_null_check; + float alpha; + const float* * A = nullptr; + int64_t lda; + float* * B = nullptr; + int64_t ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const float*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) || + rpc_read(conn, &A, sizeof(const float* const)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &B, sizeof(float* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasStrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDtrsmBatched(void *conn) +{ + int batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int m; + int n; + double* alpha_null_check; + double alpha; + const double* * A = nullptr; + int lda; + double* * B = nullptr; + int ldb; + int request_id; + cublasStatus_t 
scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const double*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) || + rpc_read(conn, &A, sizeof(const double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &B, sizeof(double* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDtrsmBatched_64(void *conn) +{ + int64_t batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int64_t m; + int64_t n; + double* alpha_null_check; + double alpha; + const double* * A = nullptr; + int64_t lda; + double* * B = nullptr; + int64_t ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, 
&n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const double*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) || + rpc_read(conn, &A, sizeof(const double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &B, sizeof(double* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCtrsmBatched(void *conn) +{ + int batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int m; + int n; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* * A = nullptr; + int lda; + cuComplex* * B = nullptr; + int ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &B, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = 
rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCtrsmBatched_64(void *conn) +{ + int64_t batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int64_t m; + int64_t n; + cuComplex* alpha_null_check; + cuComplex alpha; + const cuComplex* * A = nullptr; + int64_t lda; + cuComplex* * B = nullptr; + int64_t ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &B, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZtrsmBatched(void 
*conn) +{ + int batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int m; + int n; + cuDoubleComplex* alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex* * A = nullptr; + int lda; + cuDoubleComplex* * B = nullptr; + int ldb; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &B, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZtrsmBatched_64(void *conn) +{ + int64_t batchCount; + cublasHandle_t handle; + cublasSideMode_t side; + cublasFillMode_t uplo; + cublasOperation_t trans; + cublasDiagType_t diag; + int64_t m; + int64_t n; + cuDoubleComplex* alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex* * A = nullptr; + int64_t lda; + cuDoubleComplex* * B = nullptr; + int64_t ldb; + int request_id; + 
cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &B, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A, lda, B, ldb, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSdgmm(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int m; + int n; + const float* A; + int lda; + float* x_null_check; + float x; + int incx; + float C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &x_null_check, sizeof(const float*)) < 0 || + (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) || + rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, 
&C, sizeof(float)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(float)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSdgmm_64(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int64_t m; + int64_t n; + const float* A; + int64_t lda; + float* x_null_check; + float x; + int64_t incx; + float C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &x_null_check, sizeof(const float*)) < 0 || + (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) || + rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(float)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(float)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDdgmm(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int m; + int n; + const double* A; + int lda; + double* x_null_check; + double x; + int incx; + double C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + 
if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &x_null_check, sizeof(const double*)) < 0 || + (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) || + rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDdgmm_64(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int64_t m; + int64_t n; + const double* A; + int64_t lda; + double* x_null_check; + double x; + int64_t incx; + double C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &x_null_check, sizeof(const double*)) < 0 || + (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) || + rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(double)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDdgmm_64(handle, mode, m, n, A, lda, 
&x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCdgmm(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int m; + int n; + const cuComplex* A; + int lda; + cuComplex* x_null_check; + cuComplex x; + int incx; + cuComplex C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &x_null_check, sizeof(const cuComplex*)) < 0 || + (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCdgmm_64(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int64_t m; + int64_t n; + const cuComplex* A; + int64_t lda; + cuComplex* x_null_check; + cuComplex x; + int64_t incx; + cuComplex C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + 
rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &x_null_check, sizeof(const cuComplex*)) < 0 || + (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZdgmm(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int m; + int n; + const cuDoubleComplex* A; + int lda; + cuDoubleComplex* x_null_check; + cuDoubleComplex x; + int incx; + cuDoubleComplex C; + int ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &incx, sizeof(int)) < 0 || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + 
rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZdgmm_64(void *conn) +{ + cublasHandle_t handle; + cublasSideMode_t mode; + int64_t m; + int64_t n; + const cuDoubleComplex* A; + int64_t lda; + cuDoubleComplex* x_null_check; + cuDoubleComplex x; + int64_t incx; + cuDoubleComplex C; + int64_t ldc; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &incx, sizeof(int64_t)) < 0 || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSmatinvBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int n; + const float* * A = nullptr; + int lda; + float* * Ainv = nullptr; + int lda_inv; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const float* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Ainv, 
sizeof(float* const)) < 0 || + rpc_read(conn, &lda_inv, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDmatinvBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int n; + const double* * A = nullptr; + int lda; + double* * Ainv = nullptr; + int lda_inv; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Ainv, sizeof(double* const)) < 0 || + rpc_read(conn, &lda_inv, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCmatinvBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int n; + const cuComplex* * A = nullptr; + int lda; + cuComplex* * Ainv = nullptr; + int lda_inv; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, 
sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Ainv, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &lda_inv, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZmatinvBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int n; + const cuDoubleComplex* * A = nullptr; + int lda; + cuDoubleComplex* * Ainv = nullptr; + int lda_inv; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Ainv, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda_inv, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgeqrfBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int m; + int n; + float* * Aarray = nullptr; + int lda; + float* * TauArray = nullptr; + int info; + int request_id; + 
cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(float* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(float* const)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgeqrfBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int m; + int n; + double* * Aarray = nullptr; + int lda; + double* * TauArray = nullptr; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(double* const)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgeqrfBatched(void *conn) +{ + int batchSize; + cublasHandle_t 
handle; + int m; + int n; + cuComplex* * Aarray = nullptr; + int lda; + cuComplex* * TauArray = nullptr; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgeqrfBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + int m; + int n; + cuDoubleComplex* * Aarray = nullptr; + int lda; + cuDoubleComplex* * TauArray = nullptr; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgeqrfBatched(handle, m, n, Aarray, lda, TauArray, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + 
rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgelsBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int m; + int n; + int nrhs; + float* * Aarray = nullptr; + int lda; + float* * Carray = nullptr; + int ldc; + int info; + int devInfoArray; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(float* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(float* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgelsBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int m; + int n; + int nrhs; + double* * Aarray = nullptr; + int lda; + double* * Carray = nullptr; + int ldc; + int info; + int devInfoArray; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) 
< 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(double* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(double* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgelsBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int m; + int n; + int nrhs; + cuComplex* * Aarray = nullptr; + int lda; + cuComplex* * Carray = nullptr; + int ldc; + int info; + int devInfoArray; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = 
cublasCgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgelsBatched(void *conn) +{ + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int m; + int n; + int nrhs; + cuDoubleComplex* * Aarray = nullptr; + int lda; + cuDoubleComplex* * Carray = nullptr; + int ldc; + int info; + int devInfoArray; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, &info, &devInfoArray, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasStpttr(void *conn) +{ + cublasHandle_t handle; + cublasFillMode_t uplo; + int n; + float* AP_null_check; + float AP; + float A; + int lda; + int request_id; + 
cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &AP_null_check, sizeof(const float*)) < 0 || + (AP_null_check && rpc_read(conn, &AP, sizeof(const float)) < 0) || + rpc_read(conn, &A, sizeof(float)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasStpttr(handle, uplo, n, &AP, &A, lda); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &A, sizeof(float)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDtpttr(void *conn) +{ + cublasHandle_t handle; + cublasFillMode_t uplo; + int n; + double* AP_null_check; + double AP; + double A; + int lda; + int request_id; + cublasStatus_t scuda_intercept_result; + if ( + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &AP_null_check, sizeof(const double*)) < 0 || + (AP_null_check && rpc_read(conn, &AP, sizeof(const double)) < 0) || + rpc_read(conn, &A, sizeof(double)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasDtpttr(handle, uplo, n, &AP, &A, lda); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(float)) < 0 || + rpc_write(conn, &A, sizeof(double)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39887,41 +41937,35 @@ int handle_cublasSdgmm_64(void *conn) return -1; } -int handle_cublasDdgmm(void *conn) +int handle_cublasCtpttr(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int m; + 
cublasFillMode_t uplo; int n; - const double* A; + cuComplex* AP_null_check; + cuComplex AP; + cuComplex A; int lda; - const double* x; - int incx; - double C; - int ldc; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &AP_null_check, sizeof(const cuComplex*)) < 0 || + (AP_null_check && rpc_read(conn, &AP, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(cuComplex)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x, sizeof(const double*)) < 0 || - rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasCtpttr(handle, uplo, n, &AP, &A, lda); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_write(conn, &A, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39930,41 +41974,35 @@ int handle_cublasDdgmm(void *conn) return -1; } -int handle_cublasDdgmm_64(void *conn) +int handle_cublasZtpttr(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; - const double* A; - int64_t lda; - const double* x; - int64_t incx; - double C; - int64_t ldc; + cublasFillMode_t uplo; + int n; + cuDoubleComplex* AP_null_check; + cuDoubleComplex AP; + cuDoubleComplex A; + int lda; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, 
sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x, sizeof(const double*)) < 0 || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &AP_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (AP_null_check && rpc_read(conn, &AP, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasZtpttr(handle, uplo, n, &AP, &A, lda); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -39973,41 +42011,35 @@ int handle_cublasDdgmm_64(void *conn) return -1; } -int handle_cublasCdgmm(void *conn) +int handle_cublasStrttp(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int m; + cublasFillMode_t uplo; int n; - const cuComplex* A; + float* A_null_check; + float A; int lda; - const cuComplex* x; - int incx; - cuComplex C; - int ldc; + float AP; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &A_null_check, 
sizeof(const float*)) < 0 || + (A_null_check && rpc_read(conn, &A, sizeof(const float)) < 0) || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(float)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasStrttp(handle, uplo, n, &A, lda, &AP); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_write(conn, &AP, sizeof(float)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40016,41 +42048,35 @@ int handle_cublasCdgmm(void *conn) return -1; } -int handle_cublasCdgmm_64(void *conn) +int handle_cublasDtrttp(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; - const cuComplex* A; - int64_t lda; - const cuComplex* x; - int64_t incx; - cuComplex C; - int64_t ldc; + cublasFillMode_t uplo; + int n; + double* A_null_check; + double A; + int lda; + double AP; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A_null_check, sizeof(const double*)) < 0 || + (A_null_check && 
rpc_read(conn, &A, sizeof(const double)) < 0) || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(double)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasDtrttp(handle, uplo, n, &A, lda, &AP); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_write(conn, &AP, sizeof(double)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40059,41 +42085,35 @@ int handle_cublasCdgmm_64(void *conn) return -1; } -int handle_cublasZdgmm(void *conn) +int handle_cublasCtrttp(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int m; + cublasFillMode_t uplo; int n; - const cuDoubleComplex* A; + cuComplex* A_null_check; + cuComplex A; int lda; - const cuDoubleComplex* x; - int incx; - cuDoubleComplex C; - int ldc; + cuComplex AP; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &A_null_check, sizeof(const cuComplex*)) < 0 || + (A_null_check && rpc_read(conn, &A, sizeof(const cuComplex)) < 0) || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(cuComplex)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZdgmm(handle, mode, m, n, A, lda, x, incx, &C, ldc); + 
scuda_intercept_result = cublasCtrttp(handle, uplo, n, &A, lda, &AP); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &AP, sizeof(cuComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40102,41 +42122,35 @@ int handle_cublasZdgmm(void *conn) return -1; } -int handle_cublasZdgmm_64(void *conn) +int handle_cublasZtrttp(void *conn) { cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; - const cuDoubleComplex* A; - int64_t lda; - const cuDoubleComplex* x; - int64_t incx; - cuDoubleComplex C; - int64_t ldc; + cublasFillMode_t uplo; + int n; + cuDoubleComplex* A_null_check; + cuDoubleComplex A; + int lda; + cuDoubleComplex AP; int request_id; cublasStatus_t scuda_intercept_result; if ( rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A_null_check, sizeof(const cuDoubleComplex*)) < 0 || + (A_null_check && rpc_read(conn, &A, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &AP, sizeof(cuDoubleComplex)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZdgmm_64(handle, mode, m, n, A, lda, x, incx, &C, ldc); + scuda_intercept_result = cublasZtrttp(handle, uplo, n, &A, lda, &AP); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, 
sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40145,33 +42159,41 @@ int handle_cublasZdgmm_64(void *conn) return -1; } -int handle_cublasStpttr(void *conn) +int handle_cublasSgetriBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; int n; - const float* AP; - float A; + const float* * A = nullptr; int lda; + int* P_null_check; + int P; + float* * C = nullptr; + int ldc; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(const float*)) < 0 || - rpc_read(conn, &A, sizeof(float)) < 0 || + rpc_read(conn, &A, sizeof(const float* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int*)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(float* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasStpttr(handle, uplo, n, AP, &A, lda); + scuda_intercept_result = cublasSgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &A, sizeof(float)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40180,33 +42202,41 @@ int handle_cublasStpttr(void *conn) return -1; } -int handle_cublasDtpttr(void *conn) +int handle_cublasDgetriBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; int n; - const double* AP; - double A; + const double* * A = nullptr; 
int lda; + int* P_null_check; + int P; + double* * C = nullptr; + int ldc; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(const double*)) < 0 || - rpc_read(conn, &A, sizeof(double)) < 0 || + rpc_read(conn, &A, sizeof(const double* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int*)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(double* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtpttr(handle, uplo, n, AP, &A, lda); + scuda_intercept_result = cublasDgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &A, sizeof(double)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40215,33 +42245,41 @@ int handle_cublasDtpttr(void *conn) return -1; } -int handle_cublasCtpttr(void *conn) +int handle_cublasCgetriBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; int n; - const cuComplex* AP; - cuComplex A; + const cuComplex* * A = nullptr; int lda; + int* P_null_check; + int P; + cuComplex* * C = nullptr; + int ldc; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(const cuComplex*)) < 0 || - rpc_read(conn, &A, 
sizeof(cuComplex)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int*)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCtpttr(handle, uplo, n, AP, &A, lda); + scuda_intercept_result = cublasCgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &A, sizeof(cuComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40250,33 +42288,41 @@ int handle_cublasCtpttr(void *conn) return -1; } -int handle_cublasZtpttr(void *conn) +int handle_cublasZgetriBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; int n; - const cuDoubleComplex* AP; - cuDoubleComplex A; + const cuDoubleComplex* * A = nullptr; int lda; + int* P_null_check; + int P; + cuDoubleComplex* * C = nullptr; + int ldc; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(const cuDoubleComplex*)) < 0 || - rpc_read(conn, &A, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int*)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, 
&info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZtpttr(handle, uplo, n, AP, &A, lda); + scuda_intercept_result = cublasZgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &A, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40285,33 +42331,45 @@ int handle_cublasZtpttr(void *conn) return -1; } -int handle_cublasStrttp(void *conn) +int handle_cublasSgetrsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; + cublasOperation_t trans; int n; - const float* A; + int nrhs; + const float* * Aarray = nullptr; int lda; - float AP; + int* devIpiv_null_check; + int devIpiv; + float* * Barray = nullptr; + int ldb; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const float*)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const float* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(float)) < 0 || + rpc_read(conn, &devIpiv_null_check, sizeof(const int*)) < 0 || + (devIpiv_null_check && rpc_read(conn, &devIpiv, sizeof(const int)) < 0) || + rpc_read(conn, &Barray, sizeof(float* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasStrttp(handle, uplo, n, A, lda, &AP); + scuda_intercept_result = cublasSgetrsBatched(handle, 
trans, n, nrhs, Aarray, lda, &devIpiv, Barray, ldb, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &AP, sizeof(float)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40320,33 +42378,45 @@ int handle_cublasStrttp(void *conn) return -1; } -int handle_cublasDtrttp(void *conn) +int handle_cublasDgetrsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; + cublasOperation_t trans; int n; - const double* A; + int nrhs; + const double* * Aarray = nullptr; int lda; - double AP; + int* devIpiv_null_check; + int devIpiv; + double* * Barray = nullptr; + int ldb; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const double*)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const double* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(double)) < 0 || + rpc_read(conn, &devIpiv_null_check, sizeof(const int*)) < 0 || + (devIpiv_null_check && rpc_read(conn, &devIpiv, sizeof(const int)) < 0) || + rpc_read(conn, &Barray, sizeof(double* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDtrttp(handle, uplo, n, A, lda, &AP); + scuda_intercept_result = cublasDgetrsBatched(handle, trans, n, nrhs, Aarray, lda, &devIpiv, Barray, ldb, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &AP, sizeof(double)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || 
rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40355,33 +42425,43 @@ int handle_cublasDtrttp(void *conn) return -1; } -int handle_cublasCtrttp(void *conn) +int handle_cublasCgetrsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; + cublasOperation_t trans; int n; - const cuComplex* A; + int nrhs; + const cuComplex* * Aarray = nullptr; int lda; - cuComplex AP; + const int* devIpiv; + cuComplex* * Barray = nullptr; + int ldb; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex*)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(cuComplex)) < 0 || + rpc_read(conn, &devIpiv, sizeof(const int*)) < 0 || + rpc_read(conn, &Barray, sizeof(cuComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCtrttp(handle, uplo, n, A, lda, &AP); + scuda_intercept_result = cublasCgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &AP, sizeof(cuComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -40390,33 +42470,45 @@ int handle_cublasCtrttp(void *conn) return -1; } -int handle_cublasZtrttp(void *conn) +int handle_cublasZgetrsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasFillMode_t uplo; + 
cublasOperation_t trans; int n; - const cuDoubleComplex* A; + int nrhs; + const cuDoubleComplex* * Aarray = nullptr; int lda; - cuDoubleComplex AP; + int* devIpiv_null_check; + int devIpiv; + cuDoubleComplex* * Barray = nullptr; + int ldb; + int info; int request_id; cublasStatus_t scuda_intercept_result; if ( + rpc_read(conn, &batchSize, sizeof(int)) < 0 || rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex*)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex* const)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &AP, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &devIpiv_null_check, sizeof(const int*)) < 0 || + (devIpiv_null_check && rpc_read(conn, &devIpiv, sizeof(const int)) < 0) || + rpc_read(conn, &Barray, sizeof(cuDoubleComplex* const)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasZtrttp(handle, uplo, n, A, lda, &AP); + scuda_intercept_result = cublasZgetrsBatched(handle, trans, n, nrhs, Aarray, lda, &devIpiv, Barray, ldb, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &AP, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -43904,6 +45996,18 @@ static RequestHandler opHandlers[] = { handle_cublasCtrmm_v2_64, handle_cublasZtrmm_v2, handle_cublasZtrmm_v2_64, + handle_cublasHgemmBatched, + handle_cublasHgemmBatched_64, + handle_cublasSgemmBatched, + handle_cublasSgemmBatched_64, + handle_cublasDgemmBatched, + handle_cublasDgemmBatched_64, + handle_cublasCgemmBatched, + 
handle_cublasCgemmBatched_64, + handle_cublasCgemm3mBatched, + handle_cublasCgemm3mBatched_64, + handle_cublasZgemmBatched, + handle_cublasZgemmBatched_64, handle_cublasHgemmStridedBatched, handle_cublasHgemmStridedBatched_64, handle_cublasSgemmStridedBatched, @@ -43916,6 +46020,8 @@ static RequestHandler opHandlers[] = { handle_cublasCgemm3mStridedBatched_64, handle_cublasZgemmStridedBatched, handle_cublasZgemmStridedBatched_64, + nullptr, + handle_cublasGemmBatchedEx_64, handle_cublasSgeam, handle_cublasSgeam_64, handle_cublasDgeam, @@ -43924,6 +46030,14 @@ static RequestHandler opHandlers[] = { handle_cublasCgeam_64, handle_cublasZgeam, handle_cublasZgeam_64, + handle_cublasStrsmBatched, + handle_cublasStrsmBatched_64, + handle_cublasDtrsmBatched, + handle_cublasDtrsmBatched_64, + handle_cublasCtrsmBatched, + handle_cublasCtrsmBatched_64, + handle_cublasZtrsmBatched, + handle_cublasZtrsmBatched_64, handle_cublasSdgmm, handle_cublasSdgmm_64, handle_cublasDdgmm, @@ -43932,6 +46046,18 @@ static RequestHandler opHandlers[] = { handle_cublasCdgmm_64, handle_cublasZdgmm, handle_cublasZdgmm_64, + handle_cublasSmatinvBatched, + handle_cublasDmatinvBatched, + handle_cublasCmatinvBatched, + handle_cublasZmatinvBatched, + handle_cublasSgeqrfBatched, + handle_cublasDgeqrfBatched, + handle_cublasCgeqrfBatched, + handle_cublasZgeqrfBatched, + handle_cublasSgelsBatched, + handle_cublasDgelsBatched, + handle_cublasCgelsBatched, + handle_cublasZgelsBatched, handle_cublasStpttr, handle_cublasDtpttr, handle_cublasCtpttr, @@ -43940,6 +46066,14 @@ static RequestHandler opHandlers[] = { handle_cublasDtrttp, handle_cublasCtrttp, handle_cublasZtrttp, + handle_cublasSgetriBatched, + handle_cublasDgetriBatched, + handle_cublasCgetriBatched, + handle_cublasZgetriBatched, + handle_cublasSgetrsBatched, + handle_cublasDgetrsBatched, + handle_cublasCgetrsBatched, + handle_cublasZgetrsBatched, handle_cublasUint8gemmBias, nullptr, nullptr, diff --git a/codegen/manual_server.cpp 
b/codegen/manual_server.cpp index 2b1305c..cd2b0fa 100755 --- a/codegen/manual_server.cpp +++ b/codegen/manual_server.cpp @@ -137,7 +137,7 @@ int handle_cudaMemcpyAsync(void *conn) std::cerr << "Failed to allocate host memory for device-to-host transfer." << std::endl; return -1; } - + int request_id = rpc_end_request(conn); if (request_id < 0) { diff --git a/local.sh b/local.sh index 56fe3de..caa9c70 100755 --- a/local.sh +++ b/local.sh @@ -27,6 +27,8 @@ build() { nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o + if [ ! -f "$libscuda_path" ]; then echo "libscuda.so not found. build may have failed." exit 1 @@ -144,6 +146,23 @@ test_cudnn() { fi } +test_cublas_batched() { + output=$(LD_PRELOAD="$libscuda_path" ./cublas_batched.o | tail -n 5) + + expected_output=$'=====\nC[1]\n111.00 122.00\n151.00 166.00\n=====' + + # trim ugly output from the file + output=$(echo "$output" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + expected_output=$(echo "$expected_output" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + + if [[ "$output" == "$expected_output" ]]; then + ansi_format "pass" "$pass_message" + else + ansi_format "fail" "test_cublas_batched failed. Got [$output]." + return 1 + fi +} + #---- declare test cases ----# declare -A test_cuda_avail=( ["function"]="test_cuda_available" @@ -170,8 +189,13 @@ declare -A test_cudnn=( ["pass"]="cuDNN correctly applies sigmoid activation on a tensor." ) +declare -A test_cublas_batched=( + ["function"]="test_cublas_batched" + ["pass"]="Batched cublas works via test/cublas_batched.cu." 
+) + #---- assign them to our associative array ----# -tests=("test_cuda_avail" "test_tensor_to_cuda" "test_tensor_to_cuda_to_cpu" "test_vector_add" "test_cudnn") +tests=("test_cuda_avail" "test_tensor_to_cuda" "test_tensor_to_cuda_to_cpu" "test_vector_add" "test_cudnn" "test_cublas_batched") test() { build diff --git a/test/cublas_batched.cu b/test/cublas_batched.cu new file mode 100644 index 0000000..d5e3f92 --- /dev/null +++ b/test/cublas_batched.cu @@ -0,0 +1,196 @@ +/* + * Copyright 2020 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#include +#include +#include + +#include +#include + +#include "cublas_utils.h" + +using data_type = double; + +int main(int argc, char *argv[]) { + cublasHandle_t cublasH = NULL; + cudaStream_t stream = NULL; + + const int m = 2; + const int n = 2; + const int k = 2; + const int lda = 2; + const int ldb = 2; + const int ldc = 2; + const int batch_count = 2; + + /* + * A = | 1.0 | 2.0 | 5.0 | 6.0 | + * | 3.0 | 4.0 | 7.0 | 8.0 | + * + * B = | 5.0 | 6.0 | 9.0 | 10.0 | + * | 7.0 | 8.0 | 11.0 | 12.0 | + */ + + const std::vector> A_array = {{1.0 ,3.0, 2.0, 4.0}, + {5.0, 7.0, 6.0, 8.0}}; + const std::vector> B_array = {{5.0, 7.0, 6.0, 8.0}, + {9.0, 11.0, 10.0, 12.0}}; + std::vector> C_array(batch_count, std::vector(m * n)); + + const data_type alpha = 1.0; + const data_type beta = 0.0; + + data_type **d_A_array = nullptr; + data_type **d_B_array = nullptr; + data_type **d_C_array = nullptr; + + std::vector d_A(batch_count, nullptr); + std::vector d_B(batch_count, nullptr); + std::vector d_C(batch_count, nullptr); + + cublasOperation_t transa = CUBLAS_OP_N; + cublasOperation_t transb = CUBLAS_OP_N; + + printf("A[0]\n"); + print_matrix(m, k, A_array[0].data(), lda); + printf("=====\n"); + + printf("A[1]\n"); + print_matrix(m, k, A_array[1].data(), lda); + printf("=====\n"); + + printf("B[0]\n"); + print_matrix(k, n, B_array[0].data(), ldb); + printf("=====\n"); + + printf("B[1]\n"); + print_matrix(k, n, B_array[1].data(), ldb); + printf("=====\n"); + + /* step 1: create cublas handle, bind a stream */ + CUBLAS_CHECK(cublasCreate(&cublasH)); + + CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUBLAS_CHECK(cublasSetStream(cublasH, stream)); + + /* step 2: copy data to device */ + for (int i = 0; i < batch_count; i++) { + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_A[i]), sizeof(data_type) * A_array[i].size())); + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_B[i]), sizeof(data_type) * B_array[i].size())); + CUDA_CHECK( + 
cudaMalloc(reinterpret_cast(&d_C[i]), sizeof(data_type) * C_array[i].size())); + } + + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_A_array), sizeof(data_type *) * batch_count)); + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_B_array), sizeof(data_type *) * batch_count)); + CUDA_CHECK( + cudaMalloc(reinterpret_cast(&d_C_array), sizeof(data_type *) * batch_count)); + + for (int i = 0; i < batch_count; i++) { + CUDA_CHECK(cudaMemcpyAsync(d_A[i], A_array[i].data(), sizeof(data_type) * A_array[i].size(), + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_B[i], B_array[i].data(), sizeof(data_type) * B_array[i].size(), + cudaMemcpyHostToDevice, stream)); + } + + CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type *) * batch_count, + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type *) * batch_count, + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type *) * batch_count, + cudaMemcpyHostToDevice, stream)); + + /* step 3: compute */ + CUBLAS_CHECK(cublasDgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda, + d_B_array, ldb, &beta, d_C_array, ldc, batch_count)); + + /* step 4: copy data to host */ + for (int i = 0; i < batch_count; i++) { + CUDA_CHECK(cudaMemcpy(C_array[i].data(), d_C[i], sizeof(data_type) * C_array[i].size(), + cudaMemcpyDeviceToHost)); + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + /* + * C = | 19.0 | 22.0 | 111.0 | 122.0 | + * | 43.0 | 50.0 | 151.0 | 166.0 | + */ + + printf("C[0]\n"); + print_matrix(m, n, C_array[0].data(), ldc); + printf("=====\n"); + + printf("C[1]\n"); + print_matrix(m, n, C_array[1].data(), ldc); + printf("=====\n"); + + /* free resources */ + CUDA_CHECK(cudaFree(d_A_array)); + CUDA_CHECK(cudaFree(d_B_array)); + CUDA_CHECK(cudaFree(d_C_array)); + for (int i = 0; i < batch_count; i++) { + CUDA_CHECK(cudaFree(d_A[i])); + CUDA_CHECK(cudaFree(d_B[i])); + 
CUDA_CHECK(cudaFree(d_C[i]));
+    }
+
+    CUBLAS_CHECK(cublasDestroy(cublasH));
+
+    CUDA_CHECK(cudaStreamDestroy(stream));
+
+    CUDA_CHECK(cudaDeviceReset());
+
+    return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/test/cublas_utils.h b/test/cublas_utils.h
new file mode 100644
index 0000000..61b64ea
--- /dev/null
+++ b/test/cublas_utils.h
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <string>
+
+#include <cuComplex.h>
+#include <cublas_api.h>
+#include <cuda_runtime_api.h>
+#include <library_types.h>
+
+// CUDA API error checking: evaluates `err` once; on any status other than
+// cudaSuccess prints the code with file/line and throws std::runtime_error
+#define CUDA_CHECK(err)                                                        \
+    do {                                                                       \
+        cudaError_t err_ = (err);                                              \
+        if (err_ != cudaSuccess) {                                             \
+            std::printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+            throw std::runtime_error("CUDA error");                            \
+        }                                                                      \
+    } while (0)
+
+// cublas API error checking: evaluates `err` once; on any status other than
+// CUBLAS_STATUS_SUCCESS prints the code with file/line and throws
+#define CUBLAS_CHECK(err)                                                        \
+    do {                                                                         \
+        cublasStatus_t err_ = (err);                                             \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                     \
+            std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+            throw std::runtime_error("cublas error");                            \
+        }                                                                        \
+    } while (0)
+
+// memory alignment: rounds A up to the next multiple of B (arguments are
+// parenthesized so that compound expressions such as `n + 1` expand correctly)
+#define ALIGN_TO(A, B) ((((A) + (B) - 1) / (B)) * (B))
+
+// device memory pitch alignment
+static const size_t device_alignment = 32;
+
+// type traits: for each supported element type T, exposes the matching real
+// scalar type S, a zero constant, the cudaDataType tag and small arithmetic
+// helpers (abs, rand, add, mul)
+template <typename T> struct traits;
+
+template <> struct traits<float> {
+    // scalar type
+    typedef float T;
+    typedef T S;
+
+    static constexpr T zero = 0.f;
+    static constexpr cudaDataType cuda_data_type = CUDA_R_32F;
+
+    inline static S abs(T val) { return fabs(val); }
+
+    template <typename RNG> inline static T rand(RNG &gen) { return (S)gen(); }
+
+    inline static T add(T a, T b) { return a + b; }
+
+    inline static T mul(T v, double f) { return v * f; }
+};
+
+template <> struct traits<double> {
+    // scalar type
+    typedef double T;
+    typedef T S;
+
+    static constexpr T zero = 0.;
+    static constexpr cudaDataType cuda_data_type = CUDA_R_64F;
+
+    inline static S abs(T val) { return fabs(val); }
+
+    template <typename RNG> inline static T rand(RNG &gen) { return (S)gen(); }
+
+    inline static T add(T a, T b) { return a + b; }
+
+    inline static T mul(T v, double f) { return v * f; }
+};
+
+template <> struct traits<cuFloatComplex> {
+    // scalar type
+    typedef float S;
+    typedef cuFloatComplex T;
+
+    static constexpr T zero = {0.f, 0.f};
+    static constexpr cudaDataType cuda_data_type = CUDA_C_32F;
+
+    inline static S abs(T val) { return cuCabsf(val); }
+
+    template <typename RNG> inline static T rand(RNG &gen) {
+        return make_cuFloatComplex((S)gen(), (S)gen());
+    }
+
+    inline static T add(T a, T b) { return cuCaddf(a, b); }
+    inline static T add(T a, S b) { return cuCaddf(a, make_cuFloatComplex(b, 0.f)); }
+
+    inline static T mul(T v, double f) { return make_cuFloatComplex(v.x * f, v.y * f); }
+};
+
+template <> struct traits<cuDoubleComplex> {
+    // scalar type
+    typedef double S;
+    typedef cuDoubleComplex T;
+
+    static constexpr T zero = {0., 0.};
+    static constexpr cudaDataType cuda_data_type = CUDA_C_64F;
+
+    inline static S abs(T val) { return cuCabs(val); }
+
+    template <typename RNG> inline static T rand(RNG &gen) {
+        return make_cuDoubleComplex((S)gen(), (S)gen());
+    }
+
+    inline static T add(T a, T b) { return cuCadd(a, b); }
+    inline static T add(T a, S b) { return cuCadd(a, make_cuDoubleComplex(b, 0.)); }
+
+    inline static T mul(T v, double f) { return make_cuDoubleComplex(v.x * f, v.y * f); }
+};
+
+// Prints the m x n matrix A to stdout, one row per line; element (i, j) is
+// read from A[j * lda + i]. The explicit specializations are marked inline so
+// this header can be included from more than one translation unit.
+template <typename T> void print_matrix(const int &m, const int &n, const T *A, const int &lda);
+
+template <> inline void print_matrix(const int &m, const int &n, const float *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f ", A[j * lda + i]);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const double *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f ", A[j * lda + i]);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
+        }
+        std::printf("\n");
+    }
+}
+
+template <>
+inline void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
+        }
+        std::printf("\n");
+    }
+}
+
+// Prints the triangle of an n x n matrix selected by uplo to stdout, reading
+// consecutive elements of the packed array A in print order. In upper mode the
+// skipped entries are replaced by a blank field as wide as a printed element.
+template <typename T> void print_packed_matrix(cublasFillMode_t uplo, const int &n, const T *A);
+
+template <> inline void print_packed_matrix(cublasFillMode_t uplo, const int &n, const float *A) {
+    size_t off = 0;
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) ||
+                (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) {
+                std::printf("%6.2f ", A[off++]);
+            } else if (uplo == CUBLAS_FILL_MODE_UPPER) {
+                std::printf("       ");
+            }
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_packed_matrix(cublasFillMode_t uplo, const int &n, const double *A) {
+    size_t off = 0;
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) ||
+                (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) {
+                std::printf("%6.2f ", A[off++]);
+            } else if (uplo == CUBLAS_FILL_MODE_UPPER) {
+                std::printf("       ");
+            }
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_packed_matrix(cublasFillMode_t uplo, const int &n, const cuComplex *A) {
+    size_t off = 0;
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) ||
+                (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) {
+                std::printf("%6.2f + %6.2fj ", A[off].x, A[off].y);
+                off++;
+            } else if (uplo == CUBLAS_FILL_MODE_UPPER) {
+                std::printf("                 ");
+            }
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_packed_matrix(cublasFillMode_t uplo, const int &n, const cuDoubleComplex *A) {
+    size_t off = 0;
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            if ((uplo == CUBLAS_FILL_MODE_UPPER && j >= i) ||
+                (uplo == CUBLAS_FILL_MODE_LOWER && j <= i)) {
+                std::printf("%6.2f + %6.2fj ", A[off].x, A[off].y);
+                off++;
+            } else if (uplo == CUBLAS_FILL_MODE_UPPER) {
+                std::printf("                 ");
+            }
+        }
+        std::printf("\n");
+    }
+}
+
+// Prints the first m elements of A to stdout on a single line.
+template <typename T> void print_vector(const int &m, const T *A);
+
+template <> inline void print_vector(const int &m, const float *A) {
+    for (int i = 0; i < m; i++) {
+        std::printf("%0.2f ", A[i]);
+    }
+    std::printf("\n");
+}
+
+template <> inline void print_vector(const int &m, const double *A) {
+    for (int i = 0; i < m; i++) {
+        std::printf("%0.2f ", A[i]);
+    }
+    std::printf("\n");
+}
+
+template <> inline void print_vector(const int &m, const cuComplex *A) {
+    for (int i = 0; i < m; i++) {
+        std::printf("%0.2f + %0.2fj ", A[i].x, A[i].y);
+    }
+    std::printf("\n");
+}
+
+template <> inline void print_vector(const int &m, const cuDoubleComplex *A) {
+    for (int i = 0; i < m; i++) {
+        std::printf("%0.2f + %0.2fj ", A[i].x, A[i].y);
+    }
+    std::printf("\n");
+}
+
+// Allocates with malloc an m x n host matrix, fills it with values produced by
+// a uniform_real_distribution(-1.0, 1.0) and returns it through *A. Rows are
+// stored contiguously and *lda is set to n; the buffer is not freed here.
+// Throws std::runtime_error if the size is too large or the allocation fails.
+template <typename T> void generate_random_matrix(int m, int n, T **A, int *lda) {
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<typename traits<T>::S> dis(-1.0, 1.0);
+    auto rand_gen = std::bind(dis, gen);
+
+    *lda = n;
+
+    size_t matrix_mem_size = static_cast<size_t>(*lda) * m * sizeof(T); // widen before multiplying
+    // suppress gcc 7 size warning
+    if (matrix_mem_size <= PTRDIFF_MAX)
+        *A = (T *)malloc(matrix_mem_size);
+    else
+        throw std::runtime_error("Memory allocation size is too large");
+
+    if (*A == nullptr)
+        throw std::runtime_error("Unable to allocate host matrix");
+
+    // fill the matrix row by row with random values
+    for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < n; ++j) {
+            T *A_row = (*A) + *lda * i;
+            A_row[j] = traits<T>::rand(rand_gen);
+        }
+    }
+}
+
+// Makes matrix A of size mxn and leading dimension lda diagonal dominant
+// by adding the sum of absolute values of each row to that row's diagonal entry
+template <typename T> void make_diag_dominant_matrix(int m, int n, T *A, int lda) {
+    for (int i = 0; i < std::min(m, n); ++i) {
+        T *A_row = A + lda * i;
+        auto row_sum = traits<typename traits<T>::S>::zero;
+        for (int j = 0; j < n; ++j) {
+            row_sum += traits<T>::abs(A_row[j]);
+        }
+        A_row[i] = traits<T>::add(A_row[i], row_sum);
+    }
+}
+
+// Returns cudaDataType value as defined in library_types.h for the string
+// containing type name
+// (throws std::runtime_error for a name that is not listed below)
+inline cudaDataType get_cuda_library_type(const std::string &type_string) {
+    if (type_string == "CUDA_R_16F")
+        return CUDA_R_16F;
+    else if (type_string == "CUDA_C_16F")
+        return CUDA_C_16F;
+    else if (type_string == "CUDA_R_32F")
+        return CUDA_R_32F;
+    else if (type_string == "CUDA_C_32F")
+        return CUDA_C_32F;
+    else if (type_string == "CUDA_R_64F")
+        return CUDA_R_64F;
+    else if (type_string == "CUDA_C_64F")
+        return CUDA_C_64F;
+    else if (type_string == "CUDA_R_8I")
+        return CUDA_R_8I;
+    else if (type_string == "CUDA_C_8I")
+        return CUDA_C_8I;
+    else if (type_string == "CUDA_R_8U")
+        return CUDA_R_8U;
+    else if (type_string == "CUDA_C_8U")
+        return CUDA_C_8U;
+    else if (type_string == "CUDA_R_32I")
+        return CUDA_R_32I;
+    else if (type_string == "CUDA_C_32I")
+        return CUDA_C_32I;
+    else if (type_string == "CUDA_R_32U")
+        return CUDA_R_32U;
+    else if (type_string == "CUDA_C_32U")
+        return CUDA_C_32U;
+    else
+        throw std::runtime_error("Unknown CUDA datatype");
+}
\ No newline at end of file