diff --git a/codegen/codegen.py b/codegen/codegen.py
index 99961bd..67bac57 100644
--- a/codegen/codegen.py
+++ b/codegen/codegen.py
@@ -197,7 +197,7 @@ def client_rpc_write(self, f):
             f.write(
                 "        rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format(
                     param_name=self.parameter.name,
-                    param_type=self.parameter.name,
+                    param_type=self.ptr.array_of.format(),
                 )
             )
         else:
@@ -243,11 +243,10 @@ def client_unified_copy(self, f, direction, error):
     @property
     def server_declaration(self) -> str:
         if isinstance(self.ptr, Array):
-            c = self.ptr.const
-            self.ptr.const = False
-            # const[] isn't a valid part of a variable declaration
-            s = f"    {self.ptr.format().replace("const[]", "")}* {self.parameter.name} = nullptr;\n"
-            self.ptr.const = c
+            c = self.ptr.array_of.const
+            self.ptr.array_of.const = False
+            s = f"    {self.ptr.array_of.format()}* {self.parameter.name} = nullptr;\n"
+            self.ptr.array_of.const = c
         else:
             c = self.ptr.ptr_to.const
             self.ptr.ptr_to.const = False
@@ -281,9 +280,9 @@ def server_rpc_read(self, f, index) -> Optional[str]:
             )
         elif isinstance(self.ptr, Array):
             f.write(
-                "        rpc_read(conn, &{param_name}, sizeof({param_type})) < 0 ||\n".format(
+                "        rpc_read(conn, &{param_name}, sizeof({param_type}*)) < 0 ||\n".format(
                     param_name=self.parameter.name,
-                    param_type=self.ptr.format().replace("[]", ""),
+                    param_type=self.ptr.array_of.format(),
                 )
             )
         else:
@@ -690,7 +689,7 @@ def parse_annotation(annotation: str, params: list[Parameter]) -> list[tuple[Ope
             ))
         elif isinstance(param.type, Array):
             length_param = next(p for p in params if p.name == length_arg.split(":")[1])
-            if param.type.const:
+            if param.type.array_of.const:
                 recv = False
             operations.append(ArrayOperation(
                 send=send,
diff --git a/codegen/gen_api.h b/codegen/gen_api.h
index 502f648..2545cf2 100644
--- a/codegen/gen_api.h
+++ b/codegen/gen_api.h
@@ -1156,213 +1156,256 @@
 #define RPC_cublasChpr2_v2_64 1155
 #define RPC_cublasZhpr2_v2 1156
 #define RPC_cublasZhpr2_v2_64 1157
-#define RPC_cublasSgemvStridedBatched 1158
-#define RPC_cublasSgemvStridedBatched_64 1159
-#define RPC_cublasDgemvStridedBatched 1160
-#define RPC_cublasDgemvStridedBatched_64 1161
-#define RPC_cublasCgemvStridedBatched 1162
-#define RPC_cublasCgemvStridedBatched_64 1163
-#define RPC_cublasZgemvStridedBatched 1164
-#define RPC_cublasZgemvStridedBatched_64 1165
-#define RPC_cublasHSHgemvStridedBatched 1166
-#define RPC_cublasHSHgemvStridedBatched_64 1167
-#define RPC_cublasHSSgemvStridedBatched 1168
-#define RPC_cublasHSSgemvStridedBatched_64 1169
-#define RPC_cublasTSTgemvStridedBatched 1170
-#define RPC_cublasTSTgemvStridedBatched_64 1171
-#define RPC_cublasTSSgemvStridedBatched 1172
-#define RPC_cublasTSSgemvStridedBatched_64 1173
-#define RPC_cublasSgemm_v2 1174
-#define RPC_cublasSgemm_v2_64 1175
-#define RPC_cublasDgemm_v2 1176
-#define RPC_cublasDgemm_v2_64 1177
-#define RPC_cublasCgemm_v2 1178
-#define RPC_cublasCgemm_v2_64 1179
-#define RPC_cublasCgemm3m 1180
-#define RPC_cublasCgemm3m_64 1181
-#define RPC_cublasZgemm_v2 1182
-#define RPC_cublasZgemm_v2_64 1183
-#define RPC_cublasZgemm3m 1184
-#define RPC_cublasZgemm3m_64 1185
-#define RPC_cublasHgemm 1186
-#define RPC_cublasHgemm_64 1187
-#define RPC_cublasSsyrk_v2 1188
-#define RPC_cublasSsyrk_v2_64 1189
-#define RPC_cublasDsyrk_v2 1190
-#define RPC_cublasDsyrk_v2_64 1191
-#define RPC_cublasCsyrk_v2 1192
-#define RPC_cublasCsyrk_v2_64 1193
-#define RPC_cublasZsyrk_v2 1194
-#define RPC_cublasZsyrk_v2_64 1195
-#define RPC_cublasCherk_v2 1196
-#define RPC_cublasCherk_v2_64 1197
-#define RPC_cublasZherk_v2 1198
-#define RPC_cublasZherk_v2_64 1199
-#define RPC_cublasSsyr2k_v2 1200
-#define RPC_cublasSsyr2k_v2_64 1201
-#define RPC_cublasDsyr2k_v2 1202
-#define RPC_cublasDsyr2k_v2_64 1203
-#define RPC_cublasCsyr2k_v2 1204
-#define RPC_cublasCsyr2k_v2_64 1205
-#define RPC_cublasZsyr2k_v2 1206
-#define RPC_cublasZsyr2k_v2_64 1207
-#define RPC_cublasCher2k_v2 1208
-#define RPC_cublasCher2k_v2_64 1209
-#define RPC_cublasZher2k_v2 1210
-#define RPC_cublasZher2k_v2_64 1211
-#define RPC_cublasSsyrkx 1212
-#define RPC_cublasSsyrkx_64 1213
-#define RPC_cublasDsyrkx 1214
-#define RPC_cublasDsyrkx_64 1215
-#define RPC_cublasCsyrkx 1216
-#define RPC_cublasCsyrkx_64 1217
-#define RPC_cublasZsyrkx 1218
-#define RPC_cublasZsyrkx_64 1219
-#define RPC_cublasCherkx 1220
-#define RPC_cublasCherkx_64 1221
-#define RPC_cublasZherkx 1222
-#define RPC_cublasZherkx_64 1223
-#define RPC_cublasSsymm_v2 1224
-#define RPC_cublasSsymm_v2_64 1225
-#define RPC_cublasDsymm_v2 1226
-#define RPC_cublasDsymm_v2_64 1227
-#define RPC_cublasCsymm_v2 1228
-#define RPC_cublasCsymm_v2_64 1229
-#define RPC_cublasZsymm_v2 1230
-#define RPC_cublasZsymm_v2_64 1231
-#define RPC_cublasChemm_v2 1232
-#define RPC_cublasChemm_v2_64 1233
-#define RPC_cublasZhemm_v2 1234
-#define RPC_cublasZhemm_v2_64 1235
-#define RPC_cublasStrsm_v2 1236
-#define RPC_cublasStrsm_v2_64 1237
-#define RPC_cublasDtrsm_v2 1238
-#define RPC_cublasDtrsm_v2_64 1239
-#define RPC_cublasCtrsm_v2 1240
-#define RPC_cublasCtrsm_v2_64 1241
-#define RPC_cublasZtrsm_v2 1242
-#define RPC_cublasZtrsm_v2_64 1243
-#define RPC_cublasStrmm_v2 1244
-#define RPC_cublasStrmm_v2_64 1245
-#define RPC_cublasDtrmm_v2 1246
-#define RPC_cublasDtrmm_v2_64 1247
-#define RPC_cublasCtrmm_v2 1248
-#define RPC_cublasCtrmm_v2_64 1249
-#define RPC_cublasZtrmm_v2 1250
-#define RPC_cublasZtrmm_v2_64 1251
-#define RPC_cublasHgemmStridedBatched 1252
-#define RPC_cublasHgemmStridedBatched_64 1253
-#define RPC_cublasSgemmStridedBatched 1254
-#define RPC_cublasSgemmStridedBatched_64 1255
-#define RPC_cublasDgemmStridedBatched 1256
-#define RPC_cublasDgemmStridedBatched_64 1257
-#define RPC_cublasCgemmStridedBatched 1258
-#define RPC_cublasCgemmStridedBatched_64 1259
-#define RPC_cublasCgemm3mStridedBatched 1260
-#define RPC_cublasCgemm3mStridedBatched_64 1261
-#define RPC_cublasZgemmStridedBatched 1262
-#define RPC_cublasZgemmStridedBatched_64 1263
-#define RPC_cublasGemmBatchedEx 1264
-#define RPC_cublasSgeam 1265
-#define RPC_cublasSgeam_64 1266
-#define RPC_cublasDgeam 1267
-#define RPC_cublasDgeam_64 1268
-#define RPC_cublasCgeam 1269
-#define RPC_cublasCgeam_64 1270
-#define RPC_cublasZgeam 1271
-#define RPC_cublasZgeam_64 1272
-#define RPC_cublasSdgmm 1273
-#define RPC_cublasSdgmm_64 1274
-#define RPC_cublasDdgmm 1275
-#define RPC_cublasDdgmm_64 1276
-#define RPC_cublasCdgmm 1277
-#define RPC_cublasCdgmm_64 1278
-#define RPC_cublasZdgmm 1279
-#define RPC_cublasZdgmm_64 1280
-#define RPC_cublasStpttr 1281
-#define RPC_cublasDtpttr 1282
-#define RPC_cublasCtpttr 1283
-#define RPC_cublasZtpttr 1284
-#define RPC_cublasStrttp 1285
-#define RPC_cublasDtrttp 1286
-#define RPC_cublasCtrttp 1287
-#define RPC_cublasZtrttp 1288
-#define RPC_cublasUint8gemmBias 1289
-#define RPC_cublasMigrateComputeType 1290
-#define RPC_cudnnGetVersion 1291
-#define RPC_cudnnGetMaxDeviceVersion 1292
-#define RPC_cudnnGetCudartVersion 1293
-#define RPC_cudnnGetErrorString 1294
-#define RPC_cudnnGetLastErrorString 1295
-#define RPC_cudnnQueryRuntimeError 1296
-#define RPC_cudnnGetProperty 1297
-#define RPC_cudnnCreate 1298
-#define RPC_cudnnDestroy 1299
-#define RPC_cudnnSetStream 1300
-#define RPC_cudnnGetStream 1301
-#define RPC_cudnnGetCallback 1302
-#define RPC_cudnnGraphVersionCheck 1303
-#define RPC_cudnnBackendCreateDescriptor 1304
-#define RPC_cudnnBackendDestroyDescriptor 1305
-#define RPC_cudnnBackendInitialize 1306
-#define RPC_cudnnBackendFinalize 1307
-#define RPC_cudnnBackendSetAttribute 1308
-#define RPC_cudnnBackendExecute 1309
-#define RPC_cudnnBackendPopulateCudaGraph 1310
-#define RPC_cudnnBackendUpdateCudaGraph 1311
-#define RPC_cudnnCreateTensorDescriptor 1312
-#define RPC_cudnnSetTensor4dDescriptor 1313
-#define RPC_cudnnSetTensor4dDescriptorEx 1314
-#define RPC_cudnnGetTensor4dDescriptor 1315
-#define RPC_cudnnGetTensorSizeInBytes 1316
-#define RPC_cudnnDestroyTensorDescriptor 1317
-#define RPC_cudnnInitTransformDest 1318
-#define RPC_cudnnCreateTensorTransformDescriptor 1319
-#define RPC_cudnnDestroyTensorTransformDescriptor 1320
-#define RPC_cudnnCreateOpTensorDescriptor 1321
-#define RPC_cudnnSetOpTensorDescriptor 1322
-#define RPC_cudnnGetOpTensorDescriptor 1323
-#define RPC_cudnnDestroyOpTensorDescriptor 1324
-#define RPC_cudnnCreateReduceTensorDescriptor 1325
-#define RPC_cudnnSetReduceTensorDescriptor 1326
-#define RPC_cudnnGetReduceTensorDescriptor 1327
-#define RPC_cudnnDestroyReduceTensorDescriptor 1328
-#define RPC_cudnnGetReductionIndicesSize 1329
-#define RPC_cudnnGetReductionWorkspaceSize 1330
-#define RPC_cudnnCreateFilterDescriptor 1331
-#define RPC_cudnnSetFilter4dDescriptor 1332
-#define RPC_cudnnGetFilter4dDescriptor 1333
-#define RPC_cudnnGetFilterSizeInBytes 1334
-#define RPC_cudnnDestroyFilterDescriptor 1335
-#define RPC_cudnnCreatePoolingDescriptor 1336
-#define RPC_cudnnSetPooling2dDescriptor 1337
-#define RPC_cudnnGetPooling2dDescriptor 1338
-#define RPC_cudnnGetPooling2dForwardOutputDim 1339
-#define RPC_cudnnDestroyPoolingDescriptor 1340
-#define RPC_cudnnCreateActivationDescriptor 1341
-#define RPC_cudnnSetActivationDescriptor 1342
-#define RPC_cudnnGetActivationDescriptor 1343
-#define RPC_cudnnSetActivationDescriptorSwishBeta 1344
-#define RPC_cudnnGetActivationDescriptorSwishBeta 1345
-#define RPC_cudnnDestroyActivationDescriptor 1346
-#define RPC_cudnnActivationForward 1347
-#define RPC_cudnnCreateLRNDescriptor 1348
-#define RPC_cudnnSetLRNDescriptor 1349
-#define RPC_cudnnGetLRNDescriptor 1350
-#define RPC_cudnnDestroyLRNDescriptor 1351
-#define RPC_cudnnDeriveBNTensorDescriptor 1352
-#define RPC_cudnnDeriveNormTensorDescriptor 1353
-#define RPC_cudnnCreateSpatialTransformerDescriptor 1354
-#define RPC_cudnnDestroySpatialTransformerDescriptor 1355
-#define RPC_cudnnCreateDropoutDescriptor 1356
-#define RPC_cudnnDestroyDropoutDescriptor 1357
-#define RPC_cudnnDropoutGetStatesSize 1358
-#define RPC_cudnnDropoutGetReserveSpaceSize 1359
-#define RPC_cudnnGetDropoutDescriptor 1360
-#define RPC_cudnnOpsVersionCheck 1361
-#define RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1362
-#define RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1363
-#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1364
-#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1365
-#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1366
-#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1367
+#define RPC_cublasSgemvBatched 1158
+#define RPC_cublasTSTgemvBatched 1159
+#define RPC_cublasSgemvStridedBatched 1160
+#define RPC_cublasSgemvStridedBatched_64 1161
+#define RPC_cublasDgemvStridedBatched 1162
+#define RPC_cublasDgemvStridedBatched_64 1163
+#define RPC_cublasCgemvStridedBatched 1164
+#define RPC_cublasCgemvStridedBatched_64 1165
+#define RPC_cublasZgemvStridedBatched 1166
+#define RPC_cublasZgemvStridedBatched_64 1167
+#define RPC_cublasHSHgemvStridedBatched 1168
+#define RPC_cublasHSHgemvStridedBatched_64 1169
+#define RPC_cublasHSSgemvStridedBatched 1170
+#define RPC_cublasHSSgemvStridedBatched_64 1171
+#define RPC_cublasTSTgemvStridedBatched 1172
+#define RPC_cublasTSTgemvStridedBatched_64 1173
+#define RPC_cublasTSSgemvStridedBatched 1174
+#define RPC_cublasTSSgemvStridedBatched_64 1175
+#define RPC_cublasSgemm_v2 1176
+#define RPC_cublasSgemm_v2_64 1177
+#define RPC_cublasDgemm_v2 1178
+#define RPC_cublasDgemm_v2_64 1179
+#define RPC_cublasCgemm_v2 1180
+#define RPC_cublasCgemm_v2_64 1181
+#define RPC_cublasCgemm3m 1182
+#define RPC_cublasCgemm3m_64 1183
+#define RPC_cublasZgemm_v2 1184
+#define RPC_cublasZgemm_v2_64 1185
+#define RPC_cublasZgemm3m 1186
+#define RPC_cublasZgemm3m_64 1187
+#define RPC_cublasHgemm 1188
+#define RPC_cublasHgemm_64 1189
+#define RPC_cublasSsyrk_v2 1190
+#define RPC_cublasSsyrk_v2_64 1191
+#define RPC_cublasDsyrk_v2 1192
+#define RPC_cublasDsyrk_v2_64 1193
+#define RPC_cublasCsyrk_v2 1194
+#define RPC_cublasCsyrk_v2_64 1195
+#define RPC_cublasZsyrk_v2 1196
+#define RPC_cublasZsyrk_v2_64 1197
+#define RPC_cublasCherk_v2 1198
+#define RPC_cublasCherk_v2_64 1199
+#define RPC_cublasZherk_v2 1200
+#define RPC_cublasZherk_v2_64 1201
+#define RPC_cublasSsyr2k_v2 1202
+#define RPC_cublasSsyr2k_v2_64 1203
+#define RPC_cublasDsyr2k_v2 1204
+#define RPC_cublasDsyr2k_v2_64 1205
+#define RPC_cublasCsyr2k_v2 1206
+#define RPC_cublasCsyr2k_v2_64 1207
+#define RPC_cublasZsyr2k_v2 1208
+#define RPC_cublasZsyr2k_v2_64 1209
+#define RPC_cublasCher2k_v2 1210
+#define RPC_cublasCher2k_v2_64 1211
+#define RPC_cublasZher2k_v2 1212
+#define RPC_cublasZher2k_v2_64 1213
+#define RPC_cublasSsyrkx 1214
+#define RPC_cublasSsyrkx_64 1215
+#define RPC_cublasDsyrkx 1216
+#define RPC_cublasDsyrkx_64 1217
+#define RPC_cublasCsyrkx 1218
+#define RPC_cublasCsyrkx_64 1219
+#define RPC_cublasZsyrkx 1220
+#define RPC_cublasZsyrkx_64 1221
+#define RPC_cublasCherkx 1222
+#define RPC_cublasCherkx_64 1223
+#define RPC_cublasZherkx 1224
+#define RPC_cublasZherkx_64 1225
+#define RPC_cublasSsymm_v2 1226
+#define RPC_cublasSsymm_v2_64 1227
+#define RPC_cublasDsymm_v2 1228
+#define RPC_cublasDsymm_v2_64 1229
+#define RPC_cublasCsymm_v2 1230
+#define RPC_cublasCsymm_v2_64 1231
+#define RPC_cublasZsymm_v2 1232
+#define RPC_cublasZsymm_v2_64 1233
+#define RPC_cublasChemm_v2 1234
+#define RPC_cublasChemm_v2_64 1235
+#define RPC_cublasZhemm_v2 1236
+#define RPC_cublasZhemm_v2_64 1237
+#define RPC_cublasStrsm_v2 1238
+#define RPC_cublasStrsm_v2_64 1239
+#define RPC_cublasDtrsm_v2 1240
+#define RPC_cublasDtrsm_v2_64 1241
+#define RPC_cublasCtrsm_v2 1242
+#define RPC_cublasCtrsm_v2_64 1243
+#define RPC_cublasZtrsm_v2 1244
+#define RPC_cublasZtrsm_v2_64 1245
+#define RPC_cublasStrmm_v2 1246
+#define RPC_cublasStrmm_v2_64 1247
+#define RPC_cublasDtrmm_v2 1248
+#define RPC_cublasDtrmm_v2_64 1249
+#define RPC_cublasCtrmm_v2 1250
+#define RPC_cublasCtrmm_v2_64 1251
+#define RPC_cublasZtrmm_v2 1252
+#define RPC_cublasZtrmm_v2_64 1253
+#define RPC_cublasHgemmBatched 1254
+#define RPC_cublasHgemmBatched_64 1255
+#define RPC_cublasSgemmBatched 1256
+#define RPC_cublasSgemmBatched_64 1257
+#define RPC_cublasDgemmBatched 1258
+#define RPC_cublasDgemmBatched_64 1259
+#define RPC_cublasCgemmBatched 1260
+#define RPC_cublasCgemmBatched_64 1261
+#define RPC_cublasCgemm3mBatched 1262
+#define RPC_cublasCgemm3mBatched_64 1263
+#define RPC_cublasZgemmBatched 1264
+#define RPC_cublasZgemmBatched_64 1265
+#define RPC_cublasHgemmStridedBatched 1266
+#define RPC_cublasHgemmStridedBatched_64 1267
+#define RPC_cublasSgemmStridedBatched 1268
+#define RPC_cublasSgemmStridedBatched_64 1269
+#define RPC_cublasDgemmStridedBatched 1270
+#define RPC_cublasDgemmStridedBatched_64 1271
+#define RPC_cublasCgemmStridedBatched 1272
+#define RPC_cublasCgemmStridedBatched_64 1273
+#define RPC_cublasCgemm3mStridedBatched 1274
+#define RPC_cublasCgemm3mStridedBatched_64 1275
+#define RPC_cublasZgemmStridedBatched 1276
+#define RPC_cublasZgemmStridedBatched_64 1277
+#define RPC_cublasGemmBatchedEx 1278
+#define RPC_cublasGemmBatchedEx_64 1279
+#define RPC_cublasSgeam 1280
+#define RPC_cublasSgeam_64 1281
+#define RPC_cublasDgeam 1282
+#define RPC_cublasDgeam_64 1283
+#define RPC_cublasCgeam 1284
+#define RPC_cublasCgeam_64 1285
+#define RPC_cublasZgeam 1286
+#define RPC_cublasZgeam_64 1287
+#define RPC_cublasStrsmBatched 1288
+#define RPC_cublasStrsmBatched_64 1289
+#define RPC_cublasDtrsmBatched 1290
+#define RPC_cublasDtrsmBatched_64 1291
+#define RPC_cublasCtrsmBatched 1292
+#define RPC_cublasCtrsmBatched_64 1293
+#define RPC_cublasZtrsmBatched 1294
+#define RPC_cublasZtrsmBatched_64 1295
+#define RPC_cublasSdgmm 1296
+#define RPC_cublasSdgmm_64 1297
+#define RPC_cublasDdgmm 1298
+#define RPC_cublasDdgmm_64 1299
+#define RPC_cublasCdgmm 1300
+#define RPC_cublasCdgmm_64 1301
+#define RPC_cublasZdgmm 1302
+#define RPC_cublasZdgmm_64 1303
+#define RPC_cublasSmatinvBatched 1304
+#define RPC_cublasDmatinvBatched 1305
+#define RPC_cublasCmatinvBatched 1306
+#define RPC_cublasZmatinvBatched 1307
+#define RPC_cublasSgeqrfBatched 1308
+#define RPC_cublasDgeqrfBatched 1309
+#define RPC_cublasCgeqrfBatched 1310
+#define RPC_cublasZgeqrfBatched 1311
+#define RPC_cublasSgelsBatched 1312
+#define RPC_cublasDgelsBatched 1313
+#define RPC_cublasCgelsBatched 1314
+#define RPC_cublasZgelsBatched 1315
+#define RPC_cublasStpttr 1316
+#define RPC_cublasDtpttr 1317
+#define RPC_cublasCtpttr 1318
+#define RPC_cublasZtpttr 1319
+#define RPC_cublasStrttp 1320
+#define RPC_cublasDtrttp 1321
+#define RPC_cublasCtrttp 1322
+#define RPC_cublasZtrttp 1323
+#define RPC_cublasSgetriBatched 1324
+#define RPC_cublasDgetriBatched 1325
+#define RPC_cublasCgetriBatched 1326
+#define RPC_cublasZgetriBatched 1327
+#define RPC_cublasSgetrsBatched 1328
+#define RPC_cublasDgetrsBatched 1329
+#define RPC_cublasCgetrsBatched 1330
+#define RPC_cublasZgetrsBatched 1331
+#define RPC_cublasUint8gemmBias 1332
+#define RPC_cublasMigrateComputeType 1333
+#define RPC_cudnnGetVersion 1334
+#define RPC_cudnnGetMaxDeviceVersion 1335
+#define RPC_cudnnGetCudartVersion 1336
+#define RPC_cudnnGetErrorString 1337
+#define RPC_cudnnGetLastErrorString 1338
+#define RPC_cudnnQueryRuntimeError 1339
+#define RPC_cudnnGetProperty 1340
+#define RPC_cudnnCreate 1341
+#define RPC_cudnnDestroy 1342
+#define RPC_cudnnSetStream 1343
+#define RPC_cudnnGetStream 1344
+#define RPC_cudnnGetCallback 1345
+#define RPC_cudnnGraphVersionCheck 1346
+#define RPC_cudnnBackendCreateDescriptor 1347
+#define RPC_cudnnBackendDestroyDescriptor 1348
+#define RPC_cudnnBackendInitialize 1349
+#define RPC_cudnnBackendFinalize 1350
+#define RPC_cudnnBackendSetAttribute 1351
+#define RPC_cudnnBackendExecute 1352
+#define RPC_cudnnBackendPopulateCudaGraph 1353
+#define RPC_cudnnBackendUpdateCudaGraph 1354
+#define RPC_cudnnCreateTensorDescriptor 1355
+#define RPC_cudnnSetTensor4dDescriptor 1356
+#define RPC_cudnnSetTensor4dDescriptorEx 1357
+#define RPC_cudnnGetTensor4dDescriptor 1358
+#define RPC_cudnnGetTensorSizeInBytes 1359
+#define RPC_cudnnDestroyTensorDescriptor 1360
+#define RPC_cudnnInitTransformDest 1361
+#define RPC_cudnnCreateTensorTransformDescriptor 1362
+#define RPC_cudnnDestroyTensorTransformDescriptor 1363
+#define RPC_cudnnCreateOpTensorDescriptor 1364
+#define RPC_cudnnSetOpTensorDescriptor 1365
+#define RPC_cudnnGetOpTensorDescriptor 1366
+#define RPC_cudnnDestroyOpTensorDescriptor 1367
+#define RPC_cudnnCreateReduceTensorDescriptor 1368
+#define RPC_cudnnSetReduceTensorDescriptor 1369
+#define RPC_cudnnGetReduceTensorDescriptor 1370
+#define RPC_cudnnDestroyReduceTensorDescriptor 1371
+#define RPC_cudnnGetReductionIndicesSize 1372
+#define RPC_cudnnGetReductionWorkspaceSize 1373
+#define RPC_cudnnCreateFilterDescriptor 1374
+#define RPC_cudnnSetFilter4dDescriptor 1375
+#define RPC_cudnnGetFilter4dDescriptor 1376
+#define RPC_cudnnGetFilterSizeInBytes 1377
+#define RPC_cudnnDestroyFilterDescriptor 1378
+#define RPC_cudnnCreatePoolingDescriptor 1379
+#define RPC_cudnnSetPooling2dDescriptor 1380
+#define RPC_cudnnGetPooling2dDescriptor 1381
+#define RPC_cudnnGetPooling2dForwardOutputDim 1382
+#define RPC_cudnnDestroyPoolingDescriptor 1383
+#define RPC_cudnnCreateActivationDescriptor 1384
+#define RPC_cudnnSetActivationDescriptor 1385
+#define RPC_cudnnGetActivationDescriptor 1386
+#define RPC_cudnnSetActivationDescriptorSwishBeta 1387
+#define RPC_cudnnGetActivationDescriptorSwishBeta 1388
+#define RPC_cudnnDestroyActivationDescriptor 1389
+#define RPC_cudnnActivationForward 1390
+#define RPC_cudnnCreateLRNDescriptor 1391
+#define RPC_cudnnSetLRNDescriptor 1392
+#define RPC_cudnnGetLRNDescriptor 1393
+#define RPC_cudnnDestroyLRNDescriptor 1394
+#define RPC_cudnnDeriveBNTensorDescriptor 1395
+#define RPC_cudnnDeriveNormTensorDescriptor 1396
+#define RPC_cudnnCreateSpatialTransformerDescriptor 1397
+#define RPC_cudnnDestroySpatialTransformerDescriptor 1398
+#define RPC_cudnnCreateDropoutDescriptor 1399
+#define RPC_cudnnDestroyDropoutDescriptor 1400
+#define RPC_cudnnDropoutGetStatesSize 1401
+#define RPC_cudnnDropoutGetReserveSpaceSize 1402
+#define RPC_cudnnGetDropoutDescriptor 1403
+#define RPC_cudnnOpsVersionCheck 1404
+#define RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1405
+#define RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1406
+#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1407
+#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1408
+#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1409
+#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1410
diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp
index dfcfe4d..b9979f5 100644
--- a/codegen/gen_client.cpp
+++ b/codegen/gen_client.cpp
@@ -35254,6 +35254,234 @@ cublasStatus_t cublasZhpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo,
   return return_value;
 }
 
+cublasStatus_t
+cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
+                   const float *alpha, const float *const Aarray[], int lda,
+                   const float *const xarray[], int incx, const float *beta,
+                   float *const yarray[], int incy, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)xarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)xarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)xarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)yarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)yarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)yarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incy, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSgemvBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const float *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &xarray, sizeof(const float *const)) < 0 ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const float *)) < 0 ||
+      rpc_write(0, &yarray, sizeof(float *const)) < 0 ||
+      rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)xarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)xarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)xarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)yarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)yarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)yarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incy, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasTSTgemvBatched(
+    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
+    const float *alpha, const __nv_bfloat16 *const Aarray[], int lda,
+    const __nv_bfloat16 *const xarray[], int incx, const float *beta,
+    __nv_bfloat16 *const yarray[], int incy, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)xarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)xarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)xarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)yarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)yarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)yarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incy, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasTSTgemvBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const __nv_bfloat16 *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &xarray, sizeof(const __nv_bfloat16 *const)) < 0 ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const float *)) < 0 ||
+      rpc_write(0, &yarray, sizeof(__nv_bfloat16 *const)) < 0 ||
+      rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)xarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)xarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)xarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)yarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)yarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)yarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incy, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
 cublasStatus_t
 cublasSgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans,
                           int m, int n, const float *alpha, const float *A, int lda,
@@ -42823,12 +43051,14 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side,
 }
 
 cublasStatus_t
-cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
-                          cublasOperation_t transb, int m, int n, int k,
-                          const __half *alpha, const __half *A, int lda,
-                          long long int strideA, const __half *B, int ldb,
-                          long long int strideB, const __half *beta, __half *C,
-                          int ldc, long long int strideC, int batchCount) {
+cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const __half *alpha, const __half *const Aarray[], int lda,
+                   const __half *const Barray[], int ldb, const __half *beta,
+                   __half *const Carray[], int ldc, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -42841,55 +43071,61 @@ cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasHgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasHgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const __half *)) < 0 ||
-      (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) ||
-      rpc_write(0, &A, sizeof(const __half *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const __half *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const __half *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const __half *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const __half *)) < 0 ||
-      (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) ||
-      rpc_write(0, C, sizeof(__half)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(__half)) < 0 ||
+      rpc_write(0, &Carray, sizeof(__half *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -42904,38 +43140,52 @@ cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasHgemmStridedBatched_64(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const __half *alpha, const __half *A,
-    int64_t lda, long long int strideA, const __half *B, int64_t ldb,
-    long long int strideB, const __half *beta, __half *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle,
+                                     cublasOperation_t transa,
+                                     cublasOperation_t transb, int64_t m,
+                                     int64_t n, int64_t k, const __half *alpha,
+                                     const __half *const Aarray[], int64_t lda,
+                                     const __half *const Barray[], int64_t ldb,
+                                     const __half *beta, __half *const Carray[],
+                                     int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -42948,33 +43198,43 @@ cublasStatus_t cublasHgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasHgemmStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasHgemmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -42982,22 +43242,18 @@ cublasStatus_t cublasHgemmStridedBatched_64(
       rpc_write(0, &n, sizeof(int64_t)) < 0 ||
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const __half *)) < 0 ||
-      (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) ||
-      rpc_write(0, &A, sizeof(const __half *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const __half *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const __half *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const __half *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const __half *)) < 0 ||
-      (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) ||
-      rpc_write(0, C, sizeof(__half)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(__half)) < 0 ||
+      rpc_write(0, &Carray, sizeof(__half *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43012,39 +43268,50 @@ cublasStatus_t cublasHgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
 cublasStatus_t
-cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
-                          cublasOperation_t transb, int m, int n, int k,
-                          const float *alpha, const float *A, int lda,
-                          long long int strideA, const float *B, int ldb,
-                          long long int strideB, const float *beta, float *C,
-                          int ldc, long long int strideC, int batchCount) {
+cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const float *alpha, const float *const Aarray[], int lda,
+                   const float *const Barray[], int ldb, const float *beta,
+                   float *const Carray[], int ldc, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43057,55 +43324,61 @@ cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasSgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasSgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
-      (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) ||
-      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const float *const)) < 0 ||
      rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const float *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const float *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const float *)) < 0 ||
-      (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) ||
-      rpc_write(0, C, sizeof(float)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &Carray, sizeof(float *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43120,38 +43393,52 @@ cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasSgemmStridedBatched_64(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const float *alpha, const float *A,
-    int64_t lda, long long int strideA, const float *B, int64_t ldb,
-    long long int strideB, const float *beta, float *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle,
+                                     cublasOperation_t transa,
+                                     cublasOperation_t transb, int64_t m,
+                                     int64_t n, int64_t k, const float *alpha,
+                                     const float *const Aarray[], int64_t lda,
+                                     const float *const Barray[], int64_t ldb,
+                                     const float *beta, float *const Carray[],
+                                     int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43164,33 +43451,43 @@ cublasStatus_t cublasSgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasSgemmStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasSgemmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43198,22 +43495,18 @@ cublasStatus_t cublasSgemmStridedBatched_64(
       rpc_write(0, &n, sizeof(int64_t)) < 0 ||
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
-      (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) ||
-      rpc_write(0, &A, sizeof(const float *)) < 0 ||
*)) < 0 || + rpc_write(0, &Aarray, sizeof(const float *const)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const float *)) < 0 || + rpc_write(0, &Barray, sizeof(const float *const)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const float *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || - rpc_write(0, C, sizeof(float)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || - rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 || + rpc_write(0, &Carray, sizeof(float *const)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0) @@ -43228,39 +43521,50 @@ cublasStatus_t cublasSgemmStridedBatched_64( return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, 
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
 cublasStatus_t
-cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
-                          cublasOperation_t transb, int m, int n, int k,
-                          const double *alpha, const double *A, int lda,
-                          long long int strideA, const double *B, int ldb,
-                          long long int strideB, const double *beta, double *C,
-                          int ldc, long long int strideC, int batchCount) {
+cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const double *alpha, const double *const Aarray[], int lda,
+                   const double *const Barray[], int ldb, const double *beta,
+                   double *const Carray[], int ldc, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43275,31 +43579,41 @@ cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43307,21 +43621,19 @@ cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const double *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) ||
-      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const double *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const double *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const double *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const double *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) ||
-      rpc_write(0, C, sizeof(double)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &Carray, sizeof(double *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43336,38 +43648,52 @@ cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasDgemmStridedBatched_64(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const double *alpha, const double *A,
-    int64_t lda, long long int strideA, const double *B, int64_t ldb,
-    long long int strideB, const double *beta, double *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle,
+                                     cublasOperation_t transa,
+                                     cublasOperation_t transb, int64_t m,
+                                     int64_t n, int64_t k, const double *alpha,
+                                     const double *const Aarray[], int64_t lda,
+                                     const double *const Barray[], int64_t ldb,
+                                     const double *beta, double *const Carray[],
+                                     int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43382,31 +43708,41 @@ cublasStatus_t cublasDgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDgemmStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasDgemmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43415,21 +43751,19 @@ cublasStatus_t cublasDgemmStridedBatched_64(
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const double *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) ||
-      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const double *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const double *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const double *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const double *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) ||
-      rpc_write(0, C, sizeof(double)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &Carray, sizeof(double *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43444,38 +43778,51 @@ cublasStatus_t cublasDgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const cuComplex *A, int lda,
-    long long int strideA, const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, cuComplex *C, int ldc, long long int strideC,
-    int batchCount) {
+cublasStatus_t
+cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const cuComplex *alpha, const cuComplex *const Aarray[],
+                   int lda, const cuComplex *const Barray[], int ldb,
+                   const cuComplex *beta, cuComplex *const Carray[], int ldc,
+                   int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43490,31 +43837,41 @@ cublasStatus_t cublasCgemmStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43522,21 +43879,19 @@ cublasStatus_t cublasCgemmStridedBatched(
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43551,38 +43906,51 @@ cublasStatus_t cublasCgemmStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCgemmStridedBatched_64(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const cuComplex *alpha, const cuComplex *A,
-    int64_t lda, long long int strideA, const cuComplex *B, int64_t ldb,
-    long long int strideB, const cuComplex *beta, cuComplex *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+cublasStatus_t
+cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa,
+                      cublasOperation_t transb, int64_t m, int64_t n, int64_t k,
+                      const cuComplex *alpha, const cuComplex *const Aarray[],
+                      int64_t lda, const cuComplex *const Barray[], int64_t ldb,
+                      const cuComplex *beta, cuComplex *const Carray[],
+                      int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43597,31 +43965,41 @@ cublasStatus_t cublasCgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCgemmStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43630,21 +44008,19 @@ cublasStatus_t cublasCgemmStridedBatched_64(
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43659,38 +44035,51 @@ cublasStatus_t cublasCgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCgemm3mStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const cuComplex *A, int lda,
-    long long int strideA, const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, cuComplex *C, int ldc, long long int strideC,
-    int batchCount) {
+cublasStatus_t
+cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa,
+                     cublasOperation_t transb, int m, int n, int k,
+                     const cuComplex *alpha, const cuComplex *const Aarray[],
+                     int lda, const cuComplex *const Barray[], int ldb,
+                     const cuComplex *beta, cuComplex *const Carray[], int ldc,
+                     int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43705,31 +44094,41 @@ cublasStatus_t cublasCgemm3mStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemm3mBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43737,21 +44136,19 @@ cublasStatus_t cublasCgemm3mStridedBatched(
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43766,38 +44163,50 @@ cublasStatus_t cublasCgemm3mStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCgemm3mStridedBatched_64(
+cublasStatus_t cublasCgemm3mBatched_64(
     cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const cuComplex *alpha, const cuComplex *A,
-    int64_t lda, long long int strideA, const cuComplex *B, int64_t ldb,
-    long long int strideB, const cuComplex *beta, cuComplex *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+    int64_t m, int64_t n, int64_t k, const cuComplex *alpha,
+    const cuComplex *const Aarray[], int64_t lda,
+    const cuComplex *const Barray[], int64_t ldb, const cuComplex *beta,
+    cuComplex *const Carray[], int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43812,31 +44221,41 @@ cublasStatus_t cublasCgemm3mStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemm3mBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43845,21 +44264,19 @@ cublasStatus_t cublasCgemm3mStridedBatched_64(
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43874,38 +44291,50 @@ cublasStatus_t cublasCgemm3mStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZgemmStridedBatched(
+cublasStatus_t cublasZgemmBatched(
     cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A,
-    int lda, long long int strideA, const cuDoubleComplex *B, int ldb,
-    long long int strideB, const cuDoubleComplex *beta, cuDoubleComplex *C,
-    int ldc, long long int strideC, int batchCount) {
+    int m, int n, int k, const cuDoubleComplex *alpha,
+    const cuDoubleComplex *const Aarray[], int lda,
+    const cuDoubleComplex *const Barray[], int ldb, const cuDoubleComplex *beta,
+    cuDoubleComplex *const Carray[], int ldc, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43920,31 +44349,41 @@ cublasStatus_t cublasZgemmStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43953,23 +44392,20 @@ cublasStatus_t cublasZgemmStridedBatched(
       rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
       (alpha != nullptr &&
        rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuDoubleComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuDoubleComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 ||
       (beta != nullptr &&
        rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43984,39 +44420,51 @@ cublasStatus_t cublasZgemmStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZgemmStridedBatched_64(
+cublasStatus_t cublasZgemmBatched_64(
     cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
     int64_t m, int64_t n, int64_t k, const cuDoubleComplex *alpha,
-    const cuDoubleComplex *A, int64_t lda, long long int strideA,
-    const cuDoubleComplex *B, int64_t ldb, long long int strideB,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+    const cuDoubleComplex *const Aarray[], int64_t lda,
+    const cuDoubleComplex *const Barray[], int64_t ldb,
+    const cuDoubleComplex *beta, cuDoubleComplex *const Carray[], int64_t ldc,
+    int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -44031,31 +44479,41 @@ cublasStatus_t cublasZgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
(maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < - 0) - return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasZgemmStridedBatched_64) < 0 || + if (rpc_start_request(0, RPC_cublasZgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -44065,23 +44523,20 @@ cublasStatus_t cublasZgemmStridedBatched_64( rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 || (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &Aarray, sizeof(const cuDoubleComplex *const)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &Barray, sizeof(const cuDoubleComplex *const)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 || (beta != nullptr && rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || - rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &Carray, sizeof(cuDoubleComplex *const)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0) @@ -44096,37 +44551,48 @@ cublasStatus_t cublasZgemmStridedBatched_64( return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if 
(maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < - 0) - return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - const float *alpha, const float *A, int lda, - const float *beta, const float *B, int ldb, float *C, - int ldc) { +cublasStatus_t +cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const __half *alpha, const __half *A, int lda, + long long int strideA, const __half *B, int ldb, + long long int strideB, const __half *beta, __half *C, + int ldc, long long int strideC, int batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44137,39 +44603,55 @@ cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, 
(void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasSgeam) < 0 || + if (rpc_start_request(0, RPC_cublasHgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const float *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || - rpc_write(0, &A, sizeof(const float *)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const __half *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) || + rpc_write(0, &A, sizeof(const __half *)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const float *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || - rpc_write(0, &B, sizeof(const float *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const __half *)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, C, sizeof(float)) < 0 || - rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(float)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const __half *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) || + rpc_write(0, C, sizeof(__half)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44182,30 +44664,42 @@ cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, 
(void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int64_t m, int64_t n, - const float *alpha, const float *A, int64_t lda, - const float *beta, const float *B, int64_t ldb, - float *C, int64_t ldc) { +cublasStatus_t cublasHgemmStridedBatched_64( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int64_t m, int64_t n, int64_t k, const __half *alpha, const __half *A, + int64_t lda, long long int strideA, const __half *B, int64_t ldb, + long long int strideB, const __half *beta, __half *C, int64_t ldc, + long long int strideC, int64_t batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44216,40 +44710,56 @@ cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasSgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasHgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, 
sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const float *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || - rpc_write(0, &A, sizeof(const float *)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const __half *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) || + rpc_write(0, &A, sizeof(const __half *)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const float *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || - rpc_write(0, &B, sizeof(const float *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const __half *)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, C, sizeof(float)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(float)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const __half *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) || + rpc_write(0, C, sizeof(__half)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44262,30 +44772,43 @@ cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasDgeam(cublasHandle_t 
handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - const double *alpha, const double *A, int lda, - const double *beta, const double *B, int ldb, - double *C, int ldc) { +cublasStatus_t +cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float *alpha, const float *A, int lda, + long long int strideA, const float *B, int ldb, + long long int strideB, const float *beta, float *C, + int ldc, long long int strideC, int batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44296,39 +44819,55 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDgeam) < 0 || + if (rpc_start_request(0, RPC_cublasSgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || - rpc_write(0, &A, sizeof(const double *)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const float *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(const float *)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || - rpc_write(0, &B, sizeof(const double *)) < 0 || + rpc_write(0, &strideA, sizeof(long 
long int)) < 0 || + rpc_write(0, &B, sizeof(const float *)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, C, sizeof(double)) < 0 || - rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(double)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const float *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || + rpc_write(0, C, sizeof(float)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44341,30 +44880,42 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int64_t m, int64_t n, - const double *alpha, const double *A, int64_t lda, - const double *beta, const double *B, int64_t ldb, - double *C, int64_t ldc) { +cublasStatus_t cublasSgemmStridedBatched_64( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int64_t m, int64_t n, int64_t k, const float *alpha, const float *A, + int64_t lda, long long int strideA, const float *B, int64_t ldb, + long long int strideB, const float *beta, float *C, int64_t ldc, + long long int strideC, int64_t batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44375,40 +44926,56 @@ cublasStatus_t 
cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasSgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || - rpc_write(0, &A, sizeof(const double *)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const float *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(const float *)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || - rpc_write(0, &B, sizeof(const double *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const float *)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, C, sizeof(double)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(double)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const float *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || + rpc_write(0, C, sizeof(float)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 
0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44421,30 +44988,43 @@ cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - const cuComplex *alpha, const cuComplex *A, int lda, - const cuComplex *beta, const cuComplex *B, int ldb, - cuComplex *C, int ldc) { +cublasStatus_t +cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const double *alpha, const double *A, int lda, + long long int strideA, const double *B, int ldb, + long long int strideB, const double *beta, double *C, + int ldc, long long int strideC, int batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44455,39 +45035,55 @@ cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, 
cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCgeam) < 0 || + if (rpc_start_request(0, RPC_cublasDgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const double *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double *)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || - rpc_write(0, &B, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const double *)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, C, sizeof(cuComplex)) < 0 || - rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const double *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || + rpc_write(0, C, sizeof(double)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44500,31 +45096,42 @@ cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int64_t m, int64_t n, - const cuComplex *alpha, const cuComplex *A, - int64_t lda, const cuComplex *beta, - const cuComplex *B, int64_t ldb, cuComplex *C, - int64_t ldc) { +cublasStatus_t cublasDgemmStridedBatched_64( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int64_t m, int64_t n, int64_t k, const double *alpha, const double *A, + int64_t lda, long long int strideA, const double *B, int64_t ldb, + long long int strideB, const double *beta, double *C, int64_t ldc, + long long int strideC, int64_t batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44535,40 +45142,56 @@ cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasDgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const double *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double *)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || - rpc_write(0, &B, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const double *)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, C, sizeof(cuComplex)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const double *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || + rpc_write(0, C, sizeof(double)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44581,32 +45204,42 @@ cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; 
if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *beta, - const cuDoubleComplex *B, int ldb, - cuDoubleComplex *C, int ldc) { +cublasStatus_t cublasCgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, const cuComplex *A, int lda, + long long int strideA, const cuComplex *B, int ldb, long long int strideB, + const cuComplex *beta, cuComplex *C, int ldc, long long int strideC, + int batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44617,41 +45250,55 @@ cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasZgeam) < 0 || + if (rpc_start_request(0, RPC_cublasCgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const 
cuDoubleComplex *)) < 0 || - (alpha != nullptr && - rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 || - (beta != nullptr && - rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const cuComplex *)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || - rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44664,32 +45311,42 @@ cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int64_t m, int64_t n, - const cuDoubleComplex *alpha, - const 
cuDoubleComplex *A, int64_t lda, - const cuDoubleComplex *beta, - const cuDoubleComplex *B, int64_t ldb, - cuDoubleComplex *C, int64_t ldc) { +cublasStatus_t cublasCgemmStridedBatched_64( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int64_t m, int64_t n, int64_t k, const cuComplex *alpha, const cuComplex *A, + int64_t lda, long long int strideA, const cuComplex *B, int64_t ldb, + long long int strideB, const cuComplex *beta, cuComplex *C, int64_t ldc, + long long int strideC, int64_t batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44700,42 +45357,56 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasZgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasCgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 || - (alpha != nullptr && - rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 || - (beta != nullptr && - rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || - 
-      rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
@@ -44748,317 +45419,3750 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
-                           int n, const float *A, int lda, const float *x,
-                           int incx, float *C, int ldc) {
+cublasStatus_t cublasCgemm3mStridedBatched(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int m, int n, int k, const cuComplex *alpha, const cuComplex *A, int lda,
+    long long int strideA, const cuComplex *B, int ldb, long long int strideB,
+    const cuComplex *beta, cuComplex *C, int ldc, long long int strideC,
+    int batchCount) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasSdgmm) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &k, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &x, sizeof(const float *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) ||
-      rpc_write(0, &incx, sizeof(int)) < 0 ||
-      rpc_write(0, C, sizeof(float)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
-                              int64_t m, int64_t n, const float *A, int64_t lda,
-                              const float *x, int64_t incx, float *C,
-                              int64_t ldc) {
+cublasStatus_t cublasCgemm3mStridedBatched_64(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int64_t m, int64_t n, int64_t k, const cuComplex *alpha, const cuComplex *A,
+    int64_t lda, long long int strideA, const cuComplex *B, int64_t ldb,
+    long long int strideB, const cuComplex *beta, cuComplex *C, int64_t ldc,
+    long long int strideC, int64_t batchCount) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasSdgmm_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched_64) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int64_t)) < 0 ||
       rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &x, sizeof(const float *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) ||
-      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(float)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
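// [Editorial aside, not part of the generated diff.] The strided-batched stubs
// above all marshal in declaration order: enums and integers go by value,
// device pointers (A, B, C) cross the wire as raw addresses, and the
// host-resident scalars alpha/beta go as "address first, payload only if
// non-null" so the server can tell a null pointer from a value. A minimal
// sketch of that last pattern, using a hypothetical in-memory byte stream in
// place of the real rpc_write channel:
#include <cstddef>
#include <vector>

struct ByteStream { // stand-in for the RPC connection; 0 = success, like rpc_write
  std::vector<unsigned char> bytes;
  int write(const void *p, std::size_t n) {
    const unsigned char *b = static_cast<const unsigned char *>(p);
    bytes.insert(bytes.end(), b, b + n);
    return 0;
  }
};

// Mirrors:  rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
//           (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0)
int write_host_scalar(ByteStream &s, const float *alpha) {
  if (s.write(&alpha, sizeof(alpha)) < 0) // the pointer value itself
    return -1;
  if (alpha != nullptr && s.write(alpha, sizeof(*alpha)) < 0) // then the payload
    return -1;
  return 0;
}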
 
-cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
-                           int n, const double *A, int lda, const double *x,
-                           int incx, double *C, int ldc) {
+cublasStatus_t cublasZgemmStridedBatched(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A,
+    int lda, long long int strideA, const cuDoubleComplex *B, int ldb,
+    long long int strideB, const cuDoubleComplex *beta, cuDoubleComplex *C,
+    int ldc, long long int strideC, int batchCount) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDdgmm) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgemmStridedBatched) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &k, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha != nullptr &&
+       rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &x, sizeof(const double *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) ||
-      rpc_write(0, &incx, sizeof(int)) < 0 ||
-      rpc_write(0, C, sizeof(double)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 ||
+      (beta != nullptr &&
+       rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
-                              int64_t m, int64_t n, const double *A,
-                              int64_t lda, const double *x, int64_t incx,
-                              double *C, int64_t ldc) {
+cublasStatus_t cublasZgemmStridedBatched_64(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int64_t m, int64_t n, int64_t k, const cuDoubleComplex *alpha,
+    const cuDoubleComplex *A, int64_t lda, long long int strideA,
+    const cuDoubleComplex *B, int64_t ldb, long long int strideB,
+    const cuDoubleComplex *beta, cuDoubleComplex *C, int64_t ldc,
+    long long int strideC, int64_t batchCount) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDdgmm_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgemmStridedBatched_64) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int64_t)) < 0 ||
       rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha != nullptr &&
+       rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &x, sizeof(const double *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) ||
-      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(double)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 ||
+      (beta != nullptr &&
+       rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
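// [Editorial aside, not part of the generated diff.] Every stub brackets the
// RPC with maybe_copy_unified_arg: each argument is pushed HostToDevice before
// the call and pulled DeviceToHost after, so unified-memory buffers stay
// coherent across the client/server split. One plausible shape for such a
// helper, sketched with the CUDA runtime; the name, the prefetch policy, the
// device-0 assumption, and the 1-byte count are illustrative only, not the
// project's actual implementation:
#include <cuda_runtime.h>

static int maybe_copy_unified_arg_sketch(const void *p, cudaMemcpyKind dir) {
  if (p == nullptr)
    return 0;
  cudaPointerAttributes attr{};
  if (cudaPointerGetAttributes(&attr, p) != cudaSuccess)
    return 0; // plain host memory (e.g. &lda on the stack): nothing to do
  if (attr.type != cudaMemoryTypeManaged)
    return 0; // only managed allocations need migrating
  // Prefetch toward whichever side the copy direction names as the consumer.
  int dst = (dir == cudaMemcpyHostToDevice) ? 0 : cudaCpuDeviceId;
  return cudaMemPrefetchAsync(p, 1, dst, 0) == cudaSuccess ? 0 : -1;
}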
 
-cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
-                           int n, const cuComplex *A, int lda,
-                           const cuComplex *x, int incx, cuComplex *C,
-                           int ldc) {
+cublasStatus_t
+cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t transa,
+                       cublasOperation_t transb, int64_t m, int64_t n,
+                       int64_t k, const void *alpha, const void *const Aarray[],
+                       cudaDataType Atype, int64_t lda,
+                       const void *const Barray[], cudaDataType Btype,
+                       int64_t ldb, const void *beta, void *const Carray[],
+                       cudaDataType Ctype, int64_t ldc, int64_t batchCount,
+                       cublasComputeType_t computeType, cublasGemmAlgo_t algo) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Atype, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Btype, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Ctype, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&computeType, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&algo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCdgmm) < 0 ||
+  if (rpc_start_request(0, RPC_cublasGemmBatchedEx_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const void *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const void *)) < 0) ||
+      rpc_write(0, &Aarray, sizeof(const void *const)) < 0 ||
+      rpc_write(0, &Atype, sizeof(cudaDataType)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const void *const)) < 0 ||
+      rpc_write(0, &Btype, sizeof(cudaDataType)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &beta, sizeof(const void *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const void *)) < 0) ||
+      rpc_write(0, &Carray, sizeof(void *const)) < 0 ||
+      rpc_write(0, &Ctype, sizeof(cudaDataType)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &computeType, sizeof(cublasComputeType_t)) < 0 ||
+      rpc_write(0, &algo, sizeof(cublasGemmAlgo_t)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Atype, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Btype, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Ctype, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&computeType, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&algo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
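// [Editorial aside, not part of the generated diff.] cublasGemmBatchedEx_64 is
// the one entry point above where an argument is a host array of per-matrix
// device pointers, so the unified-memory pass has to dereference one level:
// first the array itself, then each of its batchCount elements, exactly as the
// static_cast<int>(batchCount) loops do. A compact model of that traversal:
#include <cstdint>

using CopyFn = int (*)(const void *); // stand-in for maybe_copy_unified_arg

int copy_batched_array(const void *const arr[], int64_t batchCount,
                       CopyFn copy) {
  if (copy(arr) < 0) // the pointer array itself
    return -1;
  for (int i = 0; i < static_cast<int>(batchCount); i++)
    if (copy(arr[i]) < 0) // each per-matrix device pointer
      return -1;
  return 0;
}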
 
+cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n,
+                           const float *alpha, const float *A, int lda,
+                           const float *beta, const float *B, int ldb, float *C,
+                           int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSgeam) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const float *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) ||
+      rpc_write(0, &B, sizeof(const float *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa,
+                              cublasOperation_t transb, int64_t m, int64_t n,
+                              const float *alpha, const float *A, int64_t lda,
+                              const float *beta, const float *B, int64_t ldb,
+                              float *C, int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSgeam_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &beta, sizeof(const float *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) ||
+      rpc_write(0, &B, sizeof(const float *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n,
+                           const double *alpha, const double *A, int lda,
+                           const double *beta, const double *B, int ldb,
+                           double *C, int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDgeam) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const double *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const double *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) ||
+      rpc_write(0, &B, sizeof(const double *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa,
+                              cublasOperation_t transb, int64_t m, int64_t n,
+                              const double *alpha, const double *A, int64_t lda,
+                              const double *beta, const double *B, int64_t ldb,
+                              double *C, int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDgeam_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const double *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &beta, sizeof(const double *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) ||
+      rpc_write(0, &B, sizeof(const double *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n,
+                           const cuComplex *alpha, const cuComplex *A, int lda,
+                           const cuComplex *beta, const cuComplex *B, int ldb,
+                           cuComplex *C, int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasCgeam) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa,
+                              cublasOperation_t transb, int64_t m, int64_t n,
+                              const cuComplex *alpha, const cuComplex *A,
+                              int64_t lda, const cuComplex *beta,
+                              const cuComplex *B, int64_t ldb, cuComplex *C,
+                              int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasCgeam_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
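// [Editorial aside, not part of the generated diff.] All of the geam stubs
// above share one frame: rpc_start_request(opcode), a chain of rpc_write
// calls, a single rpc_wait_for_response, rpc_read for the out-matrix C,
// rpc_end_response(&status), with every transport failure collapsed to
// CUBLAS_STATUS_NOT_INITIALIZED. Assuming only the 0-on-success convention of
// the rpc_* calls, the frame itself reduces to this sketch (hooks are
// hypothetical and trivially stubbed so the example stands alone):
#include <functional>

static int start_request(int op) { (void)op; return 0; }
static int wait_for_response() { return 0; }
static int end_response(int *status) { *status = 0; return 0; }

int framed_call(int op, int err, const std::function<int()> &writes,
                const std::function<int()> &reads) {
  int status = 0;
  if (start_request(op) < 0 || writes() < 0 || wait_for_response() < 0 ||
      reads() < 0 || end_response(&status) < 0)
    return err; // mirrors "return CUBLAS_STATUS_NOT_INITIALIZED;"
  return status; // mirrors "return return_value;"
}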
cuDoubleComplex *)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int64_t m, int64_t n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int64_t lda, + const cuDoubleComplex *beta, + const cuDoubleComplex *B, int64_t ldb, + cuDoubleComplex *C, int64_t ldc) { + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if 
(maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgeam_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 || + (alpha != nullptr && + rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 || + (beta != nullptr && + rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const float *alpha, const float *const A[], + int lda, float *const B[], int ldb, + int batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 
0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasStrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const float *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(const float *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(float *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if 
(maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t +cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int64_t m, int64_t n, + const float *alpha, const float *const A[], int64_t lda, + float *const B[], int64_t ldb, int64_t batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasStrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const float *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, 
sizeof(const float *const)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(float *const)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const double *alpha, const double *const A[], + int lda, double *const B[], int ldb, + int batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const double *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(double *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], 
cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t +cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int64_t m, int64_t n, + const double *alpha, const double *const A[], int64_t lda, + double *const B[], int64_t ldb, int64_t batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDtrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const double *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double *const)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(double *const)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, 
cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t +cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, const cuComplex *alpha, + const cuComplex *const A[], int lda, cuComplex *const B[], + int ldb, int batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if 
(maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCtrsmBatched_64( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, + const cuComplex *alpha, const cuComplex *const A[], int64_t lda, + 
cuComplex *const B[], int64_t ldb, int64_t batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCtrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, 
(void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZtrsmBatched( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuDoubleComplex *alpha, const cuDoubleComplex *const A[], int lda, + cuDoubleComplex *const B[], int ldb, int batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + 
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasZtrsmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha != nullptr &&
+       rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &B, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)A);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)B);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasZtrsmBatched_64(
+    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
+    cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n,
+    const cuDoubleComplex *alpha, const cuDoubleComplex *const A[], int64_t lda,
+    cuDoubleComplex *const B[], int64_t ldb, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)A);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)B);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasZtrsmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha != nullptr &&
+       rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &B, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)A);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)B);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
+                           int n, const float *A, int lda, const float *x,
+                           int incx, float *C, int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSdgmm) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &x, sizeof(const float *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
+                              int64_t m, int64_t n, const float *A, int64_t lda,
+                              const float *x, int64_t incx, float *C,
+                              int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSdgmm_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &x, sizeof(const float *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) ||
+      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
+                           int n, const double *A, int lda, const double *x,
+                           int incx, double *C, int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDdgmm) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &x, sizeof(const double *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
+                              int64_t m, int64_t n, const double *A,
+                              int64_t lda, const double *x, int64_t incx,
+                              double *C, int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDdgmm_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &x, sizeof(const double *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) ||
+      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
+                           int n, const cuComplex *A, int lda,
+                           const cuComplex *x, int incx, cuComplex *C,
+                           int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasCdgmm) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &x, sizeof(const cuComplex *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
+                              int64_t m, int64_t n, const cuComplex *A,
+                              int64_t lda, const cuComplex *x, int64_t incx,
+                              cuComplex *C, int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+ return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCdgmm_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &x, sizeof(const cuComplex *)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) || + rpc_write(0, &incx, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, + int n, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *C, int ldc) { + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZdgmm) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &x, sizeof(const cuDoubleComplex *)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &incx, sizeof(int)) < 0 || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, + int64_t m, int64_t n, const cuDoubleComplex *A, + int64_t lda, const cuDoubleComplex *x, + int64_t incx, cuDoubleComplex *C, int64_t ldc) { + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZdgmm_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &x, sizeof(const cuDoubleComplex *)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &incx, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, + const float *const A[], int lda, + float *const Ainv[], int lda_inv, int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if 
(maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const float *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(float *const)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, + const double *const A[], int lda, + double *const Ainv[], int lda_inv, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], 
cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const double *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(double *const)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, + const cuComplex *const A[], int lda, + cuComplex *const Ainv[], int lda_inv, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyHostToDevice) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const cuComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, + const cuDoubleComplex *const A[], int lda, + cuDoubleComplex *const Ainv[], int lda_inv, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyHostToDevice) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const cuDoubleComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(cuDoubleComplex *const)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, + float *const Aarray[], int lda, + float *const TauArray[], int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void 
*)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(float *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(float *const)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, + double *const Aarray[], int lda, + double *const TauArray[], int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void 
*)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(double *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(double *const)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, + cuComplex *const Aarray[], int lda, + cuComplex *const TauArray[], int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, 
(void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(cuComplex *const)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, + cuDoubleComplex *const Aarray[], int lda, + cuDoubleComplex *const TauArray[], int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + 
is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(cuDoubleComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(cuDoubleComplex *const)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, int m, int n, + int nrhs, float *const Aarray[], int lda, + float *const Carray[], int ldc, int *info, + int *devInfoArray, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(float *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(float *const)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + 
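+// Editorial note (not generator output): every client stub in this file
+// follows the same generated shape. Before the call, maybe_copy_unified_arg()
+// gives the transport a chance to synchronize any unified-memory arguments;
+// rpc_start_request() then sends the operation id (e.g. RPC_cublasSgelsBatched)
+// and rpc_write() serializes each argument in declaration order. After
+// rpc_wait_for_response(), rpc_read() pulls back the output parameters
+// (info, devInfoArray, C, A, ...) and rpc_end_response() yields the remote
+// cublasStatus_t. Every failure path collapses to
+// CUBLAS_STATUS_NOT_INITIALIZED. For the batched entry points, the generator
+// also walks the host-side pointer arrays (Aarray[i], Carray[i], ...) so that
+// unified pointers inside the batch are synchronized as well.
+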
+cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, int m, int n, + int nrhs, double *const Aarray[], int lda, + double *const Carray[], int ldc, int *info, + int *devInfoArray, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(double *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(double *const)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, 
cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, int m, int n, + int nrhs, cuComplex *const Aarray[], int lda, + cuComplex *const Carray[], int ldc, int *info, + int *devInfoArray, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 
||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 ||
+      rpc_write(0, devInfoArray, sizeof(int)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, info, sizeof(int)) < 0 ||
+      rpc_read(0, devInfoArray, sizeof(int)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasZgelsBatched(cublasHandle_t handle,
+                                  cublasOperation_t trans, int m, int n,
+                                  int nrhs, cuDoubleComplex *const Aarray[],
+                                  int lda, cuDoubleComplex *const Carray[],
+                                  int ldc, int *info, int *devInfoArray,
+                                  int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasZgelsBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 ||
+      rpc_write(0, devInfoArray, sizeof(int)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, info, sizeof(int)) < 0 ||
+      rpc_read(0, devInfoArray, sizeof(int)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
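Every stub added by this patch has the same three-phase shape: sync any unified-memory arguments toward the device, marshal the call over the RPC connection, then mirror the copies back toward the host. The sketch below condenses that shape; it is illustrative only and not part of the patch. The extern declarations are assumptions standing in for the project's rpc/unified-memory headers, and RPC_example/exampleCall are hypothetical names.

    #include <cstddef>
    #include <cuda_runtime_api.h>

    // Assumed helper signatures (hypothetical; see the project's headers).
    extern int rpc_start_request(const int conn, const unsigned int op);
    extern int rpc_write(const int conn, const void *data, std::size_t size);
    extern int rpc_wait_for_response(const int conn);
    extern int rpc_end_response(const int conn, void *return_value);
    extern int maybe_copy_unified_arg(const int conn, void *arg,
                                      cudaMemcpyKind kind);
    extern int is_unified_pointer(const int conn, void *ptr);

    #define RPC_example 0 /* hypothetical opcode */

    // Hypothetical stub showing the three phases of every wrapper above.
    int exampleCall(float *const batch[], int batchSize) {
      // 1. Pre-call: sync unified-memory args toward the device, including
      //    each element of a batched pointer array when it is unified.
      if (maybe_copy_unified_arg(0, (void *)batch, cudaMemcpyHostToDevice) < 0)
        return -1;
      for (int i = 0; i < batchSize && is_unified_pointer(0, (void *)batch);
           i++)
        if (maybe_copy_unified_arg(0, (void *)batch[i],
                                   cudaMemcpyHostToDevice) < 0)
          return -1;
      // 2. Marshal: one request, ordered writes, then a blocking wait.
      int return_value;
      if (rpc_start_request(0, RPC_example) < 0 ||
          rpc_write(0, &batchSize, sizeof(int)) < 0 ||
          rpc_write(0, &batch, sizeof(float *const)) < 0 ||
          rpc_wait_for_response(0) < 0 ||
          rpc_end_response(0, &return_value) < 0)
        return -1;
      // 3. Post-call: mirror the copies back toward the host.
      if (maybe_copy_unified_arg(0, (void *)batch, cudaMemcpyDeviceToHost) < 0)
        return -1;
      return return_value;
    }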
+cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const float *AP, float *A, int lda) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasStpttr) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &AP, sizeof(const float *)) < 0 ||
+      (AP != nullptr && rpc_write(0, AP, sizeof(const float)) < 0) ||
+      rpc_write(0, A, sizeof(float)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, A, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const double *AP, double *A, int lda) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDtpttr) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &AP, sizeof(const double *)) < 0 ||
+      (AP != nullptr && rpc_write(0, AP, sizeof(const double)) < 0) ||
+      rpc_write(0, A, sizeof(double)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, A, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const cuComplex *AP, cuComplex *A, int lda) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasCtpttr) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &AP, sizeof(const cuComplex *)) < 0 ||
+      (AP != nullptr && rpc_write(0, AP, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, A, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, A, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const cuDoubleComplex *AP, cuDoubleComplex *A,
+                            int lda) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasZtpttr) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &AP, sizeof(const cuDoubleComplex *)) < 0 ||
+      (AP != nullptr && rpc_write(0, AP, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, A, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
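Note the two-step serialization of the const pointer argument in the Xtpttr stubs above: the stub first writes the pointer value itself, then, guarded against nullptr, exactly one element (sizeof(const float), not the full packed matrix). The hypothetical helper below, which is illustrative and not part of the patch, condenses that idiom; only sizeof(T) bytes cross the wire, so for device-resident buffers the pointer identity is what actually matters.

    #include <cstddef>

    extern int rpc_write(const int conn, const void *data, std::size_t size);

    // Hypothetical helper mirroring the idiom used for AP above.
    template <typename T>
    int write_const_ptr_arg(const int conn, const T *p) {
      if (rpc_write(conn, &p, sizeof(const T *)) < 0) // pointer identity
        return -1;
      if (p != nullptr && rpc_write(conn, p, sizeof(const T)) < 0) // one elem
        return -1;
      return 0;
    }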
+cublasStatus_t cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const float *A, int lda, float *AP) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasStrttp) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      (A != nullptr && rpc_write(0, A, sizeof(const float)) < 0) ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &x, sizeof(const cuComplex *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &incx, sizeof(int)) < 0 ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, AP, sizeof(float)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, AP, sizeof(float)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
@@ -45066,62 +49170,40 @@ cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
-                              int64_t m, int64_t n, const cuComplex *A,
-                              int64_t lda, const cuComplex *x, int64_t incx,
-                              cuComplex *C, int64_t ldc) {
+cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const double *A, int lda, double *AP) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCdgmm_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasDtrttp) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
-      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &x, sizeof(const cuComplex *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      (A != nullptr && rpc_write(0, A, sizeof(const double)) < 0) ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, AP, sizeof(double)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
@@ -45129,61 +49211,40 @@ cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
-                           int n, const cuDoubleComplex *A, int lda,
-                           const cuDoubleComplex *x, int incx,
-                           cuDoubleComplex *C, int ldc) {
+cublasStatus_t cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const cuComplex *A, int lda, cuComplex *AP) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZdgmm) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCtrttp) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      (A != nullptr && rpc_write(0, A, sizeof(const cuComplex)) < 0) ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &x, sizeof(const cuDoubleComplex *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, &incx, sizeof(int)) < 0 ||
-      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, AP, sizeof(cuComplex)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, AP, sizeof(cuComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
@@ -45191,62 +49252,42 @@ cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
-                              int64_t m, int64_t n, const cuDoubleComplex *A,
-                              int64_t lda, const cuDoubleComplex *x,
-                              int64_t incx, cuDoubleComplex *C, int64_t ldc) {
+cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const cuDoubleComplex *A, int lda,
+                            cuDoubleComplex *AP) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZdgmm_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZtrttp) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
       rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
-      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &x, sizeof(const cuDoubleComplex *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      (A != nullptr && rpc_write(0, A, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, AP, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
@@ -45254,344 +49295,676 @@ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
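The getri/getrs stubs that follow add a per-element pass over each batched pointer array on top of the usual argument sync. The hypothetical helper below condenses that loop; it is illustrative only, not part of the patch, and the extern declarations are assumptions standing in for the project's unified-memory helpers.

    #include <cuda_runtime_api.h>

    extern int maybe_copy_unified_arg(const int conn, void *arg,
                                      cudaMemcpyKind kind);
    extern int is_unified_pointer(const int conn, void *ptr);

    // Hypothetical helper matching the generated loops below: the pointer
    // array itself is synced first, then, only while the array is unified
    // memory (the guard short-circuits to zero iterations otherwise), each
    // per-matrix pointer it holds.
    template <typename T>
    int sync_batched_args(const int conn, T *const batch[], int batchSize,
                          cudaMemcpyKind kind) {
      if (maybe_copy_unified_arg(conn, (void *)batch, kind) < 0)
        return -1;
      for (int i = 0;
           i < batchSize && is_unified_pointer(conn, (void *)batch); i++)
        if (maybe_copy_unified_arg(conn, (void *)batch[i], kind) < 0)
          return -1;
      return 0;
    }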
-cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-                            const float *AP, float *A, int lda) {
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n,
+                                   const float *const A[], int lda,
+                                   const int *P, float *const C[], int ldc,
+                                   int *info, int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasStpttr) < 0 ||
+  if (rpc_start_request(0, RPC_cublasSgetriBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &AP, sizeof(const float *)) < 0 ||
-      (AP != nullptr && rpc_write(0, AP, sizeof(const float)) < 0) ||
-      rpc_write(0, A, sizeof(float)) < 0 ||
-      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, A, sizeof(float)) < 0 ||
+      rpc_write(0, &A, sizeof(const float *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &P, sizeof(const int *)) < 0 ||
+      (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) ||
+      rpc_write(0, &C, sizeof(float *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-                            const double *AP, double *A, int lda) {
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n,
+                                   const double *const A[], int lda,
+                                   const int *P, double *const C[], int ldc,
+                                   int *info, int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDtpttr) < 0 ||
+  if (rpc_start_request(0, RPC_cublasDgetriBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &AP, sizeof(const double *)) < 0 ||
-      (AP != nullptr && rpc_write(0, AP, sizeof(const double)) < 0) ||
-      rpc_write(0, A, sizeof(double)) < 0 ||
-      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, A, sizeof(double)) < 0 ||
+      rpc_write(0, &A, sizeof(const double *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &P, sizeof(const int *)) < 0 ||
+      (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) ||
+      rpc_write(0, &C, sizeof(double *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
      rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
-}
-
-cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-                            const cuComplex *AP, cuComplex *A, int lda) {
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+}
+
+cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n,
+                                   const cuComplex *const A[], int lda,
+                                   const int *P, cuComplex *const C[], int ldc,
+                                   int *info, int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCtpttr) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgetriBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &AP, sizeof(const cuComplex *)) < 0 ||
-      (AP != nullptr && rpc_write(0, AP, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, A, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, A, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &A, sizeof(const cuComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &P, sizeof(const int *)) < 0 ||
+      (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) ||
+      rpc_write(0, &C, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-                            const cuDoubleComplex *AP, cuDoubleComplex *A,
-                            int lda) {
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n,
+                                   const cuDoubleComplex *const A[], int lda,
+                                   const int *P, cuDoubleComplex *const C[],
+                                   int ldc, int *info, int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZtpttr) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgetriBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &AP, sizeof(const cuDoubleComplex *)) < 0 ||
-      (AP != nullptr && rpc_write(0, AP, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, A, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &P, sizeof(const int *)) < 0 ||
+      (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) ||
+      rpc_write(0, &C, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
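A server handler has to consume a request in exactly the byte order the client stub writes it. The sketch below mirrors the write order of the cublasSgetrsBatched stub that follows; it is illustrative only and not part of the patch. The handler name, the conn parameter, and the rpc_read declaration are assumptions standing in for the project's server plumbing.

    #include <cstddef>
    #include <cublas_v2.h>

    extern int rpc_read(const void *conn, void *data, std::size_t size);

    // Hypothetical server-side read sequence for an SgetrsBatched request.
    int handle_cublasSgetrsBatched_request(const void *conn) {
      int batchSize, n, nrhs, lda, ldb, info;
      cublasHandle_t handle;
      cublasOperation_t trans;
      const float *const *Aarray;
      const int *devIpiv;
      float *const *Barray;
      int devIpiv_payload; // single element shipped when devIpiv != nullptr
      if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
          rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
          rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
          rpc_read(conn, &n, sizeof(int)) < 0 ||
          rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
          rpc_read(conn, &Aarray, sizeof(const float *const)) < 0 ||
          rpc_read(conn, &lda, sizeof(int)) < 0 ||
          rpc_read(conn, &devIpiv, sizeof(const int *)) < 0 ||
          (devIpiv != nullptr &&
           rpc_read(conn, &devIpiv_payload, sizeof(const int)) < 0) ||
          rpc_read(conn, &Barray, sizeof(float *const)) < 0 ||
          rpc_read(conn, &ldb, sizeof(int)) < 0 ||
          rpc_read(conn, &info, sizeof(int)) < 0)
        return -1;
      // ... invoke cublasSgetrsBatched and stream info back ...
      return 0;
    }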
rpc_write(0, &batchSize, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const float *)) < 0 || - (A != nullptr && rpc_write(0, A, sizeof(const float)) < 0) || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(const float *const)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, AP, sizeof(float)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, AP, sizeof(float)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int *)) < 0 || + (devIpiv != nullptr && rpc_write(0, devIpiv, sizeof(const int)) < 0) || + rpc_write(0, &Barray, sizeof(float *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, - const double *A, int lda, double *AP) { +cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, int n, int nrhs, + const double *const Aarray[], int lda, + const int *devIpiv, double *const Barray[], + int ldb, int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDtrttp) < 0 || + if (rpc_start_request(0, RPC_cublasDgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const double *)) < 0 || - (A != nullptr && rpc_write(0, A, sizeof(const double)) < 0) || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(const double *const)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, AP, sizeof(double)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int *)) < 0 || + (devIpiv != nullptr && rpc_write(0, devIpiv, sizeof(const int)) < 0) || + rpc_write(0, &Barray, sizeof(double *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, 
cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, - const cuComplex *A, int lda, cuComplex *AP) { +cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, int n, int nrhs, + const cuComplex *const Aarray[], int lda, + const int *devIpiv, + cuComplex *const Barray[], int ldb, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCtrttp) < 0 || + if (rpc_start_request(0, RPC_cublasCgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || - (A != nullptr && rpc_write(0, A, sizeof(const cuComplex)) < 0) || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, AP, sizeof(cuComplex)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, AP, sizeof(cuComplex)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int *)) < 0 || + rpc_write(0, &Barray, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, - const cuDoubleComplex *A, int lda, - cuDoubleComplex *AP) { +cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, int n, int nrhs, + const cuDoubleComplex *const Aarray[], + int lda, const int *devIpiv, + cuDoubleComplex *const Barray[], int ldb, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, 
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<size_t>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<size_t>(batchSize) && is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZtrttp) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgetrsBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
-      (A != nullptr && rpc_write(0, A, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuDoubleComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, AP, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &devIpiv, sizeof(const int *)) < 0 ||
+      (devIpiv != nullptr && rpc_write(0, devIpiv, sizeof(const int)) < 0) ||
+      rpc_write(0, &Barray, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<size_t>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<size_t>(batchSize) && is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
@@ -49363,6 +53736,8 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasChpr2_v2_64", (void *)cublasChpr2_v2_64},
     {"cublasZhpr2_v2", (void *)cublasZhpr2_v2},
     {"cublasZhpr2_v2_64", (void *)cublasZhpr2_v2_64},
+    {"cublasSgemvBatched", (void *)cublasSgemvBatched},
+    {"cublasTSTgemvBatched", (void *)cublasTSTgemvBatched},
     {"cublasSgemvStridedBatched", (void *)cublasSgemvStridedBatched},
     {"cublasSgemvStridedBatched_64", (void *)cublasSgemvStridedBatched_64},
     {"cublasDgemvStridedBatched", (void *)cublasDgemvStridedBatched},
@@ -49457,6 +53832,18 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCtrmm_v2_64", (void *)cublasCtrmm_v2_64},
     {"cublasZtrmm_v2", (void *)cublasZtrmm_v2},
     {"cublasZtrmm_v2_64", (void *)cublasZtrmm_v2_64},
+    {"cublasHgemmBatched", (void *)cublasHgemmBatched},
+    {"cublasHgemmBatched_64", (void *)cublasHgemmBatched_64},
+    {"cublasSgemmBatched", (void *)cublasSgemmBatched},
+    {"cublasSgemmBatched_64", (void *)cublasSgemmBatched_64},
+    {"cublasDgemmBatched", (void *)cublasDgemmBatched},
+    {"cublasDgemmBatched_64", (void *)cublasDgemmBatched_64},
+    {"cublasCgemmBatched", (void *)cublasCgemmBatched},
+    {"cublasCgemmBatched_64", (void *)cublasCgemmBatched_64},
+    {"cublasCgemm3mBatched", (void *)cublasCgemm3mBatched},
+    {"cublasCgemm3mBatched_64", (void *)cublasCgemm3mBatched_64},
+    {"cublasZgemmBatched", (void *)cublasZgemmBatched},
+    {"cublasZgemmBatched_64", (void *)cublasZgemmBatched_64},
     {"cublasHgemmStridedBatched", (void *)cublasHgemmStridedBatched},
     {"cublasHgemmStridedBatched_64", (void *)cublasHgemmStridedBatched_64},
     {"cublasSgemmStridedBatched", (void *)cublasSgemmStridedBatched},
@@ -49469,6 +53856,7 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCgemm3mStridedBatched_64", (void *)cublasCgemm3mStridedBatched_64},
     {"cublasZgemmStridedBatched", (void *)cublasZgemmStridedBatched},
     {"cublasZgemmStridedBatched_64", (void *)cublasZgemmStridedBatched_64},
+    {"cublasGemmBatchedEx_64", (void *)cublasGemmBatchedEx_64},
     {"cublasSgeam", (void *)cublasSgeam},
     {"cublasSgeam_64", (void *)cublasSgeam_64},
     {"cublasDgeam", (void *)cublasDgeam},
@@ -49477,6 +53865,14 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCgeam_64", (void *)cublasCgeam_64},
     {"cublasZgeam", (void *)cublasZgeam},
     {"cublasZgeam_64", (void *)cublasZgeam_64},
+    {"cublasStrsmBatched", (void *)cublasStrsmBatched},
+    {"cublasStrsmBatched_64", (void *)cublasStrsmBatched_64},
+    {"cublasDtrsmBatched", (void *)cublasDtrsmBatched},
+    {"cublasDtrsmBatched_64", (void *)cublasDtrsmBatched_64},
+    {"cublasCtrsmBatched", (void *)cublasCtrsmBatched},
+    {"cublasCtrsmBatched_64", (void *)cublasCtrsmBatched_64},
+    {"cublasZtrsmBatched", (void *)cublasZtrsmBatched},
+    {"cublasZtrsmBatched_64", (void *)cublasZtrsmBatched_64},
     {"cublasSdgmm", (void *)cublasSdgmm},
     {"cublasSdgmm_64", (void *)cublasSdgmm_64},
     {"cublasDdgmm", (void *)cublasDdgmm},
@@ -49485,6 +53881,18 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCdgmm_64", (void *)cublasCdgmm_64},
     {"cublasZdgmm", (void *)cublasZdgmm},
     {"cublasZdgmm_64", (void *)cublasZdgmm_64},
+    {"cublasSmatinvBatched", (void *)cublasSmatinvBatched},
+    {"cublasDmatinvBatched", (void *)cublasDmatinvBatched},
+    {"cublasCmatinvBatched", (void *)cublasCmatinvBatched},
+    {"cublasZmatinvBatched", (void *)cublasZmatinvBatched},
+    {"cublasSgeqrfBatched", (void *)cublasSgeqrfBatched},
+    {"cublasDgeqrfBatched", (void *)cublasDgeqrfBatched},
+    {"cublasCgeqrfBatched", (void *)cublasCgeqrfBatched},
+    {"cublasZgeqrfBatched", (void *)cublasZgeqrfBatched},
+    {"cublasSgelsBatched", (void *)cublasSgelsBatched},
+    {"cublasDgelsBatched", (void *)cublasDgelsBatched},
+    {"cublasCgelsBatched", (void *)cublasCgelsBatched},
+    {"cublasZgelsBatched", (void *)cublasZgelsBatched},
     {"cublasStpttr", (void *)cublasStpttr},
     {"cublasDtpttr", (void *)cublasDtpttr},
     {"cublasCtpttr", (void *)cublasCtpttr},
@@ -49493,6 +53901,14 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasDtrttp", (void *)cublasDtrttp},
     {"cublasCtrttp", (void *)cublasCtrttp},
     {"cublasZtrttp", (void *)cublasZtrttp},
+    {"cublasSgetriBatched", (void *)cublasSgetriBatched},
+    {"cublasDgetriBatched", (void *)cublasDgetriBatched},
+    {"cublasCgetriBatched", (void *)cublasCgetriBatched},
+    {"cublasZgetriBatched", (void *)cublasZgetriBatched},
+    {"cublasSgetrsBatched", (void *)cublasSgetrsBatched},
+    {"cublasDgetrsBatched", (void *)cublasDgetrsBatched},
+    {"cublasCgetrsBatched", (void *)cublasCgetrsBatched},
+    {"cublasZgetrsBatched", (void *)cublasZgetrsBatched},
     {"cublasUint8gemmBias", (void *)cublasUint8gemmBias},
     {"cudnnGetProperty", (void *)cudnnGetProperty},
     {"cudnnCreate", (void *)cudnnCreate},
diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp
index 6d24ab5..d42ef85 100644
--- a/codegen/gen_server.cpp
+++ b/codegen/gen_server.cpp
@@ -31601,6 +31601,100 @@ int handle_cublasZhpr2_v2_64(void *conn) {
   return -1;
 }
 
+int handle_cublasSgemvBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasOperation_t trans;
+  int m;
+  int n;
+  const float *alpha;
+  const float **Aarray = nullptr;
+  int lda;
+  const float **xarray = nullptr;
+  int incx;
+  const float *beta;
+  float **yarray = nullptr;
+  int incy;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &xarray, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &yarray, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &incy, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasSgemvBatched(handle, trans, m, n, alpha, Aarray, lda, xarray, incx,
+                         beta, yarray, incy, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasTSTgemvBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasOperation_t trans;
+  int m;
+  int n;
+  const float *alpha;
+  const __nv_bfloat16 **Aarray = nullptr;
+  int lda;
+  const __nv_bfloat16 **xarray = nullptr;
+  int incx;
+  const float *beta;
+  __nv_bfloat16 **yarray = nullptr;
+  int incy;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const __nv_bfloat16 *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &xarray, sizeof(const __nv_bfloat16 *const *)) < 0 ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &yarray, sizeof(__nv_bfloat16 *const *)) < 0 ||
+      rpc_read(conn, &incy, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasTSTgemvBatched(handle, trans, m, n, alpha, Aarray, lda, xarray,
+                           incx, beta, yarray, incy, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
 int handle_cublasSgemvStridedBatched(void *conn) {
   cublasHandle_t handle;
   cublasOperation_t trans;
@@ -36114,60 +36208,49 @@ int handle_cublasZtrmm_v2_64(void *conn) {
   return -1;
 }
 
-int handle_cublasHgemmStridedBatched(void *conn) {
+int handle_cublasHgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int m;
   int n;
   int k;
-  __half *alpha_null_check;
-  __half alpha;
-  const __half *A;
+  const __half *alpha;
+  const __half **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const __half *B;
+  const __half **Barray = nullptr;
   int ldb;
-  long long int strideB;
-  __half *beta_null_check;
-  __half beta;
-  __half C;
+  const __half *beta;
+  __half **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
       rpc_read(conn, &k, sizeof(int)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const __half *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) ||
-      rpc_read(conn, &A, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const __half *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const __half *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const __half *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) ||
-      rpc_read(conn, &C, sizeof(__half)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &beta, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Carray, sizeof(__half *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasHgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasHgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                         Barray, ldb, beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(__half)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36176,60 +36259,49 @@ int handle_cublasHgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasHgemmStridedBatched_64(void *conn) {
+int handle_cublasHgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int64_t m;
   int64_t n;
   int64_t k;
-  __half *alpha_null_check;
-  __half alpha;
-  const __half *A;
+  const __half *alpha;
+  const __half **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const __half *B;
+  const __half **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
-  __half *beta_null_check;
-  __half beta;
-  __half C;
+  const __half *beta;
+  __half **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const __half *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) ||
-      rpc_read(conn, &A, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const __half *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const __half *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const __half *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) ||
-      rpc_read(conn, &C, sizeof(__half)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &beta, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Carray, sizeof(__half *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasHgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasHgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                            Barray, ldb, beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(__half)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36238,60 +36310,49 @@ int handle_cublasHgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasSgemmStridedBatched(void *conn) {
+int handle_cublasSgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int m;
   int n;
   int k;
-  float *alpha_null_check;
-  float alpha;
-  const float *A;
+  const float *alpha;
+  const float **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const float *B;
+  const float **Barray = nullptr;
   int ldb;
-  long long int strideB;
-  float *beta_null_check;
-  float beta;
-  float C;
+  const float *beta;
+  float **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
       rpc_read(conn, &k, sizeof(int)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const float *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const float *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &beta, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Carray, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
    goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasSgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                         Barray, ldb, beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36300,60 +36361,49 @@ int handle_cublasSgemmStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasSgemmStridedBatched_64(void *conn) {
+int handle_cublasSgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int64_t m;
   int64_t n;
   int64_t k;
-  float *alpha_null_check;
-  float alpha;
-  const float *A;
+  const float *alpha;
+  const float **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const float *B;
+  const float **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
-  float *beta_null_check;
-  float beta;
-  float C;
+  const float *beta;
+  float **Carray = nullptr;
  int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const float *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const float *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &beta, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Carray, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasSgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasSgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                            Barray, ldb, beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36362,7 +36412,8 @@ int handle_cublasSgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasDgemmStridedBatched(void *conn) {
+int handle_cublasDgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36371,21 +36422,18 @@ int handle_cublasDgemmStridedBatched(void *conn) {
   int k;
   double *alpha_null_check;
   double alpha;
-  const double *A;
+  const double **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const double *B;
+  const double **Barray = nullptr;
   int ldb;
-  long long int strideB;
   double *beta_null_check;
   double beta;
-  double C;
+  double **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
@@ -36393,29 +36441,24 @@ int handle_cublasDgemmStridedBatched(void *conn) {
       rpc_read(conn, &k, sizeof(int)) < 0 ||
       rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
-      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const double *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const double *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) ||
-      rpc_read(conn, &C, sizeof(double)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasDgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasDgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda,
+                         Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(double)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36424,7 +36467,8 @@ int handle_cublasDgemmStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasDgemmStridedBatched_64(void *conn) {
+int handle_cublasDgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36433,21 +36477,18 @@ int handle_cublasDgemmStridedBatched_64(void *conn) {
   int64_t k;
   double *alpha_null_check;
   double alpha;
-  const double *A;
+  const double **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const double *B;
+  const double **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
   double *beta_null_check;
   double beta;
-  double C;
+  double **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
@@ -36455,29 +36496,24 @@ int handle_cublasDgemmStridedBatched_64(void *conn) {
       rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
-      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const double *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const double *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) ||
-      rpc_read(conn, &C, sizeof(double)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasDgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasDgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray,
+                            lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(double)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36486,7 +36522,8 @@ int handle_cublasDgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasCgemmStridedBatched(void *conn) {
+int handle_cublasCgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36495,21 +36532,18 @@ int handle_cublasCgemmStridedBatched(void *conn) {
   int k;
   cuComplex *alpha_null_check;
   cuComplex alpha;
-  const cuComplex *A;
+  const cuComplex **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const cuComplex *B;
+  const cuComplex **Barray = nullptr;
   int ldb;
-  long long int strideB;
   cuComplex *beta_null_check;
   cuComplex beta;
-  cuComplex C;
+  cuComplex **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
@@ -36518,29 +36552,24 @@ int handle_cublasCgemmStridedBatched(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasCgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda,
+                         Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36549,7 +36578,8 @@ int handle_cublasCgemmStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasCgemmStridedBatched_64(void *conn) {
+int handle_cublasCgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36558,21 +36588,18 @@ int handle_cublasCgemmStridedBatched_64(void *conn) {
   int64_t k;
   cuComplex *alpha_null_check;
   cuComplex alpha;
-  const cuComplex *A;
+  const cuComplex **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const cuComplex *B;
+  const cuComplex **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
   cuComplex *beta_null_check;
   cuComplex beta;
-  cuComplex C;
+  cuComplex **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
@@ -36581,29 +36608,24 @@ int handle_cublasCgemmStridedBatched_64(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasCgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray,
+                            lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36612,7 +36634,8 @@ int handle_cublasCgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasCgemm3mStridedBatched(void *conn) {
+int handle_cublasCgemm3mBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36621,21 +36644,18 @@ int handle_cublasCgemm3mStridedBatched(void *conn) {
   int k;
   cuComplex *alpha_null_check;
   cuComplex alpha;
-  const cuComplex *A;
+  const cuComplex **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const cuComplex *B;
+  const cuComplex **Barray = nullptr;
   int ldb;
-  long long int strideB;
   cuComplex *beta_null_check;
   cuComplex beta;
-  cuComplex C;
+  cuComplex **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
@@ -36644,29 +36664,24 @@ int handle_cublasCgemm3mStridedBatched(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgemm3mStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasCgemm3mBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda,
+                           Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36675,7 +36690,8 @@ int handle_cublasCgemm3mStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasCgemm3mStridedBatched_64(void *conn) {
+int handle_cublasCgemm3mBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36684,21 +36700,18 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) {
   int64_t k;
   cuComplex *alpha_null_check;
   cuComplex alpha;
-  const cuComplex *A;
+  const cuComplex **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const cuComplex *B;
+  const cuComplex **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
   cuComplex *beta_null_check;
   cuComplex beta;
-  cuComplex C;
+  cuComplex **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
@@ -36707,29 +36720,24 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgemm3mStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasCgemm3mBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray,
+                              lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36738,7 +36746,8 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasZgemmStridedBatched(void *conn) {
+int handle_cublasZgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36747,21 +36756,18 @@ int handle_cublasZgemmStridedBatched(void *conn) {
   int k;
   cuDoubleComplex *alpha_null_check;
   cuDoubleComplex alpha;
-  const cuDoubleComplex *A;
+  const cuDoubleComplex **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const cuDoubleComplex *B;
+  const cuDoubleComplex **Barray = nullptr;
   int ldb;
-  long long int strideB;
   cuDoubleComplex *beta_null_check;
   cuDoubleComplex beta;
-  cuDoubleComplex C;
+  cuDoubleComplex **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
@@ -36770,30 +36776,25 @@ int handle_cublasZgemmStridedBatched(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuDoubleComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasZgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasZgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda,
+                         Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36802,7 +36803,8 @@ int handle_cublasZgemmStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasZgemmStridedBatched_64(void *conn) {
+int handle_cublasZgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36811,21 +36813,18 @@ int handle_cublasZgemmStridedBatched_64(void *conn) {
   int64_t k;
   cuDoubleComplex *alpha_null_check;
   cuDoubleComplex alpha;
-  const cuDoubleComplex *A;
+  const cuDoubleComplex **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const cuDoubleComplex *B;
+  const cuDoubleComplex **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
   cuDoubleComplex *beta_null_check;
   cuDoubleComplex beta;
-  cuDoubleComplex C;
+  cuDoubleComplex **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
@@ -36834,30 +36833,25 @@ int handle_cublasZgemmStridedBatched_64(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuDoubleComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasZgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasZgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray,
+                            lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36866,22 +36860,27 @@ int handle_cublasZgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasSgeam(void *conn) {
+int handle_cublasHgemmStridedBatched(void *conn) {
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int m;
   int n;
-  float *alpha_null_check;
-  float alpha;
-  const float *A;
+  int k;
+  __half *alpha_null_check;
+  __half alpha;
+  const __half *A;
   int lda;
-  float *beta_null_check;
-  float beta;
-  const float *B;
+  long long int strideA;
+  const __half *B;
   int ldb;
-  float C;
+  long long int strideB;
+  __half *beta_null_check;
+  __half beta;
+  __half C;
   int ldc;
+  long long int strideC;
+  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
   if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -36889,26 +36888,32 @@ int handle_cublasSgeam(void *conn) {
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &k, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const __half *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) ||
+      rpc_read(conn, &A, sizeof(const __half *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
-      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &B, sizeof(const __half *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const __half *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) ||
+      rpc_read(conn, &C, sizeof(__half)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, &alpha, A,
-                                       lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result = cublasHgemmStridedBatched(
+      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
+      &beta, &C, ldc, strideC, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_write(conn, &C, sizeof(__half)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36917,22 +36922,27 @@ int handle_cublasSgeam(void *conn) {
   return -1;
 }
 
-int handle_cublasSgeam_64(void *conn) {
+int handle_cublasHgemmStridedBatched_64(void *conn) {
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int64_t m;
   int64_t n;
-  float *alpha_null_check;
-  float alpha;
-  const float *A;
+  int64_t k;
+  __half *alpha_null_check;
+  __half alpha;
+  const __half *A;
   int64_t lda;
-  float *beta_null_check;
-  float beta;
-  const float *B;
+  long long int strideA;
+  const __half *B;
   int64_t ldb;
-  float C;
+  long long int strideB;
+  __half *beta_null_check;
+  __half beta;
+  __half C;
   int64_t ldc;
+  long long int strideC;
+  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
   if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -36940,26 +36950,32 @@ int handle_cublasSgeam_64(void *conn) {
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const __half *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) ||
+      rpc_read(conn, &A, sizeof(const __half *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
-      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &B, sizeof(const __half *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const __half *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) ||
+      rpc_read(conn, &C, sizeof(__half)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, &alpha,
-                                          A, lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result = cublasHgemmStridedBatched_64(
+      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
+      &beta, &C, ldc, strideC, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_write(conn, &C, sizeof(__half)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -36968,22 +36984,151 @@ int handle_cublasSgeam_64(void *conn) {
   return -1;
 }
 
-int handle_cublasDgeam(void *conn) {
+int handle_cublasSgemmStridedBatched(void *conn) {
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int m;
   int n;
-  double *alpha_null_check;
-  double alpha;
-  const double *A;
-  int lda;
-  double *beta_null_check;
-  double beta;
+  int k;
+  float *alpha_null_check;
+  float alpha;
+  const float *A;
+  int lda;
+  long long int strideA;
+  const float *B;
+  int ldb;
+  long long int strideB;
+  float *beta_null_check;
+  float beta;
+  float C;
+  int ldc;
+  long long int strideC;
+  int batchCount;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &k, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasSgemmStridedBatched(
+      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
+      &beta, &C, ldc, strideC, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSgemmStridedBatched_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  int64_t k;
+  float *alpha_null_check;
+  float alpha;
+  const float *A;
+  int64_t lda;
+  long long int strideA;
+  const float *B;
+  int64_t ldb;
+  long long int strideB;
+  float *beta_null_check;
+  float beta;
+  float C;
+  int64_t ldc;
+  long long int strideC;
+  int64_t batchCount;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasSgemmStridedBatched_64(
+      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
+      &beta, &C, ldc, strideC, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDgemmStridedBatched(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int m;
+  int n;
+  int k;
+  double *alpha_null_check;
+  double alpha;
+  const double *A;
+  int lda;
+  long long int strideA;
   const double *B;
   int ldb;
+  long long int strideB;
+  double *beta_null_check;
+  double beta;
   double C;
   int ldc;
+  long long int strideC;
+  int batchCount;
   int request_id;
scuda_intercept_result; if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || @@ -37042,23 +37198,29 @@ int handle_cublasDgeam_64(void *conn) { rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 || (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) || rpc_read(conn, &A, sizeof(const double *)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 || - (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const double *)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || rpc_read(conn, &C, sizeof(double)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, &alpha, - A, lda, &beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasDgemmStridedBatched_64( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -37070,22 +37232,27 @@ int handle_cublasDgeam_64(void *conn) { return -1; } -int handle_cublasCgeam(void *conn) { +int handle_cublasCgemmStridedBatched(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; + int k; cuComplex *alpha_null_check; cuComplex alpha; const cuComplex *A; int lda; - cuComplex *beta_null_check; - cuComplex beta; + long long int strideA; const cuComplex *B; int ldb; + long long int strideB; + cuComplex *beta_null_check; + cuComplex beta; cuComplex C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || @@ -37093,24 +37260,30 @@ int handle_cublasCgeam(void *conn) { rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 || (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || - (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || 
false) + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, &alpha, A, - lda, &beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasCgemmStridedBatched( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuComplex)) < 0 || @@ -37122,22 +37295,27 @@ int handle_cublasCgeam(void *conn) { return -1; } -int handle_cublasCgeam_64(void *conn) { +int handle_cublasCgemmStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; + int64_t k; cuComplex *alpha_null_check; cuComplex alpha; const cuComplex *A; int64_t lda; - cuComplex *beta_null_check; - cuComplex beta; + long long int strideA; const cuComplex *B; int64_t ldb; + long long int strideB; + cuComplex *beta_null_check; + cuComplex beta; cuComplex C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || @@ -37145,27 +37323,1580 @@ int handle_cublasCgeam_64(void *conn) { rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 || (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || - (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgemmStridedBatched_64( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgemm3mStridedBatched(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + int k; + cuComplex *alpha_null_check; + cuComplex alpha; + const cuComplex *A; + int lda; + long long int strideA; + const cuComplex *B; + int ldb; + long long int strideB; + cuComplex *beta_null_check; + cuComplex beta; + cuComplex C; + int ldc; + long long int strideC; + int batchCount; + int 
request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 || + (alpha_null_check && + rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgemm3mStridedBatched( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgemm3mStridedBatched_64(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + cuComplex *alpha_null_check; + cuComplex alpha; + const cuComplex *A; + int64_t lda; + long long int strideA; + const cuComplex *B; + int64_t ldb; + long long int strideB; + cuComplex *beta_null_check; + cuComplex beta; + cuComplex C; + int64_t ldc; + long long int strideC; + int64_t batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 || + (alpha_null_check && + rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgemm3mStridedBatched_64( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, 
strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgemmStridedBatched(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + int k; + cuDoubleComplex *alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex *A; + int lda; + long long int strideA; + const cuDoubleComplex *B; + int ldb; + long long int strideB; + cuDoubleComplex *beta_null_check; + cuDoubleComplex beta; + cuDoubleComplex C; + int ldc; + long long int strideC; + int batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 || + (alpha_null_check && + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 || + (beta_null_check && + rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgemmStridedBatched( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgemmStridedBatched_64(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + cuDoubleComplex *alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex *A; + int64_t lda; + long long int strideA; + const cuDoubleComplex *B; + int64_t ldb; + long long int strideB; + cuDoubleComplex *beta_null_check; + cuDoubleComplex beta; + cuDoubleComplex C; + int64_t ldc; + long long int strideC; + int64_t batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 || + (alpha_null_check && + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuDoubleComplex 
*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 || + (beta_null_check && + rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgemmStridedBatched_64( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasGemmBatchedEx_64(void *conn) { + int64_t batchCount; + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + void *alpha_null_check; + void *alpha; + const void **Aarray = nullptr; + cudaDataType Atype; + int64_t lda; + const void **Barray = nullptr; + cudaDataType Btype; + int64_t ldb; + void *beta_null_check; + void *beta; + void **Carray = nullptr; + cudaDataType Ctype; + int64_t ldc; + cublasComputeType_t computeType; + cublasGemmAlgo_t algo; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const void *)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const void *)) < 0) || + rpc_read(conn, &Aarray, sizeof(const void *const *)) < 0 || + rpc_read(conn, &Atype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &Barray, sizeof(const void *const *)) < 0 || + rpc_read(conn, &Btype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const void *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const void *)) < 0) || + rpc_read(conn, &Carray, sizeof(void *const *)) < 0 || + rpc_read(conn, &Ctype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &computeType, sizeof(cublasComputeType_t)) < 0 || + rpc_read(conn, &algo, sizeof(cublasGemmAlgo_t)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasGemmBatchedEx_64( + handle, transa, transb, m, n, k, &alpha, Aarray, Atype, lda, Barray, + Btype, ldb, &beta, Carray, Ctype, ldc, batchCount, computeType, algo); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgeam(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + 
+  cublasOperation_t transb;
+  int m;
+  int n;
+  float *alpha_null_check;
+  float alpha;
+  const float *A;
+  int lda;
+  float *beta_null_check;
+  float beta;
+  const float *B;
+  int ldb;
+  float C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
+      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, &alpha, A,
+                                       lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSgeam_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  float *alpha_null_check;
+  float alpha;
+  const float *A;
+  int64_t lda;
+  float *beta_null_check;
+  float beta;
+  const float *B;
+  int64_t ldb;
+  float C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
+      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, &alpha,
+                                          A, lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDgeam(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int m;
+  int n;
+  double *alpha_null_check;
+  double alpha;
+  const double *A;
+  int lda;
+  double *beta_null_check;
+  double beta;
+  const double *B;
+  int ldb;
+  double C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
+      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) ||
+      rpc_read(conn, &B, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(double)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasDgeam(handle, transa, transb, m, n, &alpha, A,
+                                       lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDgeam_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  double *alpha_null_check;
+  double alpha;
+  const double *A;
+  int64_t lda;
+  double *beta_null_check;
+  double beta;
+  const double *B;
+  int64_t ldb;
+  double C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
+      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) ||
+      rpc_read(conn, &B, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(double)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, &alpha,
+                                          A, lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCgeam(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int m;
+  int n;
+  cuComplex *alpha_null_check;
+  cuComplex alpha;
+  const cuComplex *A;
+  int lda;
+  cuComplex *beta_null_check;
+  cuComplex beta;
+  const cuComplex *B;
+  int ldb;
+  cuComplex C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, &alpha, A,
+                                       lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCgeam_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  cuComplex *alpha_null_check;
+  cuComplex alpha;
+  const cuComplex *A;
+  int64_t lda;
+  cuComplex *beta_null_check;
+  cuComplex beta;
+  const cuComplex *B;
+  int64_t ldb;
+  cuComplex C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, &alpha,
+                                          A, lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZgeam(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int m;
+  int n;
+  cuDoubleComplex *alpha_null_check;
+  cuDoubleComplex alpha;
+  const cuDoubleComplex *A;
+  int lda;
+  cuDoubleComplex *beta_null_check;
+  cuDoubleComplex beta;
+  const cuDoubleComplex *B;
+  int ldb;
+  cuDoubleComplex C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (beta_null_check &&
+       rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, &alpha, A,
+                                       lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZgeam_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  cuDoubleComplex *alpha_null_check;
+  cuDoubleComplex alpha;
+  const cuDoubleComplex *A;
+  int64_t lda;
+  cuDoubleComplex *beta_null_check;
+  cuDoubleComplex beta;
+  const cuDoubleComplex *B;
+  int64_t ldb;
+  cuDoubleComplex C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (beta_null_check &&
+       rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, &alpha,
+                                          A, lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasStrsmBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int m;
+  int n;
+  float *alpha_null_check;
+  float alpha;
+  const float **A = nullptr;
+  int lda;
+  float **B = nullptr;
+  int ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &B, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasStrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda,
+                         B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasStrsmBatched_64(void *conn) {
+  int64_t batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int64_t m;
+  int64_t n;
+  float *alpha_null_check;
+  float alpha;
+  const float **A = nullptr;
+  int64_t lda;
+  float **B = nullptr;
+  int64_t ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &B, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasStrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A,
+                            lda, B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDtrsmBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int m;
+  int n;
+  double *alpha_null_check;
+  double alpha;
+  const double **A = nullptr;
+  int lda;
+  double **B = nullptr;
+  int ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
+      rpc_read(conn, &A, sizeof(const double *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &B, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasDtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda,
+                         B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDtrsmBatched_64(void *conn) {
+  int64_t batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int64_t m;
+  int64_t n;
+  double *alpha_null_check;
+  double alpha;
+  const double **A = nullptr;
+  int64_t lda;
+  double **B = nullptr;
+  int64_t ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
+      rpc_read(conn, &A, sizeof(const double *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &B, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasDtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A,
+                            lda, B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCtrsmBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int m;
+  int n;
+  cuComplex *alpha_null_check;
+  cuComplex alpha;
+  const cuComplex **A = nullptr;
+  int lda;
+  cuComplex **B = nullptr;
+  int ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &B, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasCtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda,
+                         B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCtrsmBatched_64(void *conn) {
+  int64_t batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int64_t m;
+  int64_t n;
+  cuComplex *alpha_null_check;
+  cuComplex alpha;
+  const cuComplex **A = nullptr;
+  int64_t lda;
+  cuComplex **B = nullptr;
+  int64_t ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &B, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasCtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A,
+                            lda, B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZtrsmBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int m;
+  int n;
+  cuDoubleComplex *alpha_null_check;
+  cuDoubleComplex alpha;
+  const cuDoubleComplex **A = nullptr;
+  int lda;
+  cuDoubleComplex **B = nullptr;
+  int ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &B, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasZtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda,
+                         B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZtrsmBatched_64(void *conn) {
+  int64_t batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int64_t m;
+  int64_t n;
+  cuDoubleComplex *alpha_null_check;
+  cuDoubleComplex alpha;
+  const cuDoubleComplex **A = nullptr;
+  int64_t lda;
+  cuDoubleComplex **B = nullptr;
+  int64_t ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &B, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasZtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A,
+                            lda, B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSdgmm(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int m;
+  int n;
+  const float *A;
+  int lda;
+  float *x_null_check;
+  float x;
+  int incx;
+  float C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const float *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasSdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSdgmm_64(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int64_t m;
+  int64_t n;
+  const float *A;
+  int64_t lda;
+  float *x_null_check;
+  float x;
+  int64_t incx;
+  float C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const float *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasSdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDdgmm(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int m;
+  int n;
+  const double *A;
+  int lda;
+  double *x_null_check;
+  double x;
+  int incx;
+  double C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const double *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(double)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasDdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDdgmm_64(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int64_t m;
+  int64_t n;
+  const double *A;
+  int64_t lda;
+  double *x_null_check;
+  double x;
+  int64_t incx;
+  double C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const double *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(double)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasDdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCdgmm(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int m;
+  int n;
+  const cuComplex *A;
+  int lda;
+  cuComplex *x_null_check;
+  cuComplex x;
+  int incx;
+  cuComplex C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const cuComplex *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasCdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCdgmm_64(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int64_t m;
+  int64_t n;
+  const cuComplex *A;
+  int64_t lda;
+  cuComplex *x_null_check;
+  cuComplex x;
+  int64_t incx;
+  cuComplex C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const cuComplex *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasCdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZdgmm(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int m;
+  int n;
+  const cuDoubleComplex *A;
+  int lda;
+  cuDoubleComplex *x_null_check;
+  cuDoubleComplex x;
+  int incx;
+  cuDoubleComplex C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasZdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZdgmm_64(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int64_t m;
+  int64_t n;
+  const cuDoubleComplex *A;
+  int64_t lda;
+  cuDoubleComplex *x_null_check;
+  cuDoubleComplex x;
+  int64_t incx;
+  cuDoubleComplex C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasZdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSmatinvBatched(void *conn) {
+  int batchSize;
+  cublasHandle_t handle;
+  int n;
+  const float **A = nullptr;
+  int lda;
+  float **Ainv = nullptr;
+  int lda_inv;
+  int info;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &Ainv, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasSmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDmatinvBatched(void *conn) {
+  int batchSize;
+  cublasHandle_t handle;
+  int n;
+  const double **A = nullptr;
+  int lda;
+  double **Ainv = nullptr;
+  int lda_inv;
+  int info;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const double *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &Ainv, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, &alpha,
-                                          A, lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result =
+      cublasDmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37174,51 +38905,35 @@ int handle_cublasCgeam_64(void *conn) {
   return -1;
 }
 
-int handle_cublasZgeam(void *conn) {
+int handle_cublasCmatinvBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasOperation_t transa;
-  cublasOperation_t transb;
-  int m;
   int n;
-  cuDoubleComplex *alpha_null_check;
-  cuDoubleComplex alpha;
-  const cuDoubleComplex *A;
+  const cuComplex **A = nullptr;
   int lda;
-  cuDoubleComplex *beta_null_check;
-  cuDoubleComplex beta;
-  const cuDoubleComplex *B;
-  int ldb;
-  cuDoubleComplex C;
-  int ldc;
+  cuComplex **Ainv = nullptr;
+  int lda_inv;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
-      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
-      rpc_read(conn, &m, sizeof(int)) < 0 ||
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
-      (alpha_null_check &&
-       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
-      (beta_null_check &&
-       rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
-      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Ainv, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, &alpha, A,
-                                       lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result =
+      cublasCmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37227,51 +38942,35 @@ int handle_cublasZgeam(void *conn) {
   return -1;
 }
 
-int handle_cublasZgeam_64(void *conn) {
+int handle_cublasZmatinvBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasOperation_t transa;
-  cublasOperation_t transb;
-  int64_t m;
-  int64_t n;
-  cuDoubleComplex *alpha_null_check;
-  cuDoubleComplex alpha;
-  const cuDoubleComplex *A;
-  int64_t lda;
-  cuDoubleComplex *beta_null_check;
-  cuDoubleComplex beta;
-  const cuDoubleComplex *B;
-  int64_t ldb;
-  cuDoubleComplex C;
-  int64_t ldc;
+  int n;
+  const cuDoubleComplex **A = nullptr;
+  int lda;
+  cuDoubleComplex **Ainv = nullptr;
+  int lda_inv;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
-      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
-      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
-      (alpha_null_check &&
-       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
-      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
-      (beta_null_check &&
-       rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
-      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &Ainv, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, &alpha,
-                                          A, lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result =
+      cublasZmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37280,41 +38979,35 @@ int handle_cublasZgeam_64(void *conn) {
   return -1;
 }
 
-int handle_cublasSdgmm(void *conn) {
+int handle_cublasSgeqrfBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasSideMode_t mode;
   int m;
   int n;
-  const float *A;
+  float **Aarray = nullptr;
   int lda;
-  float *x_null_check;
-  float x;
-  int incx;
-  float C;
-  int ldc;
+  float **TauArray = nullptr;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(float *const *)) < 0 ||
      rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &x_null_check, sizeof(const float *)) < 0 ||
-      (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) ||
-      rpc_read(conn, &incx, sizeof(int)) < 0 ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+      rpc_read(conn, &TauArray, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
    goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result =
-      cublasSdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+  scuda_intercept_result = cublasSgeqrfBatched(handle, m, n, Aarray, lda,
+                                               TauArray, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37323,41 +39016,35 @@ int handle_cublasSdgmm(void *conn) {
   return -1;
 }
 
-int handle_cublasSdgmm_64(void *conn) {
+int handle_cublasDgeqrfBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasSideMode_t mode;
-  int64_t m;
-  int64_t n;
-  const float *A;
-  int64_t lda;
-  float *x_null_check;
-  float x;
-  int64_t incx;
-  float C;
-  int64_t ldc;
+  int m;
+  int n;
+  double **Aarray = nullptr;
+  int lda;
+  double **TauArray = nullptr;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
-      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &x_null_check, sizeof(const float *)) < 0 ||
-      (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) ||
-      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &TauArray, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result =
-      cublasSdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+  scuda_intercept_result = cublasDgeqrfBatched(handle, m, n, Aarray, lda,
+                                               TauArray, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37366,41 +39053,35 @@ int handle_cublasSdgmm_64(void *conn) {
   return -1;
 }
 
-int handle_cublasDdgmm(void *conn) {
+int handle_cublasCgeqrfBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasSideMode_t mode;
   int m;
   int n;
-  const double *A;
+  cuComplex **Aarray = nullptr;
   int lda;
-  double *x_null_check;
-  double x;
-  int incx;
-  double C;
-  int ldc;
+  cuComplex **TauArray = nullptr;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
      rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
-      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &x_null_check, sizeof(const double *)) < 0 ||
-      (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) ||
-      rpc_read(conn, &incx, sizeof(int)) < 0 ||
-      rpc_read(conn, &C, sizeof(double)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+      rpc_read(conn, &TauArray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
    goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result =
-      cublasDdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+  scuda_intercept_result = cublasCgeqrfBatched(handle, m, n, Aarray, lda,
+                                               TauArray, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37409,41 +39090,35 @@ int handle_cublasDdgmm(void *conn) {
   return -1;
 }
 
-int handle_cublasDdgmm_64(void *conn) {
+int handle_cublasZgeqrfBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasSideMode_t mode;
-  int64_t m;
-  int64_t n;
-  const double *A;
-  int64_t lda;
-  double *x_null_check;
-  double x;
-  int64_t incx;
-  double C;
-  int64_t ldc;
+  int m;
+  int n;
+  cuDoubleComplex **Aarray = nullptr;
+  int lda;
+  cuDoubleComplex **TauArray = nullptr;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
- rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const double *)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const double *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = - cublasDdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + scuda_intercept_result = cublasZgeqrfBatched(handle, m, n, Aarray, lda, + TauArray, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37452,41 +39127,45 @@ int handle_cublasDdgmm_64(void *conn) { return -1; } -int handle_cublasCdgmm(void *conn) { +int handle_cublasSgelsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasSideMode_t mode; + cublasOperation_t trans; int m; int n; - const cuComplex *A; + int nrhs; + float **Aarray = nullptr; int lda; - cuComplex *x_null_check; - cuComplex x; - int incx; - cuComplex C; + float **Carray = nullptr; int ldc; + int info; + int devInfoArray; int request_id; cublasStatus_t scuda_intercept_result; - if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(float *const *)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const cuComplex *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) || - rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || false) + rpc_read(conn, &Carray, sizeof(float *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; scuda_intercept_result = - cublasCdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + &info, &devInfoArray, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37495,41 +39174,45 @@ int handle_cublasCdgmm(void *conn) { return -1; } -int handle_cublasCdgmm_64(void 
*conn) { +int handle_cublasDgelsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; - const cuComplex *A; - int64_t lda; - cuComplex *x_null_check; - cuComplex x; - int64_t incx; - cuComplex C; - int64_t ldc; + cublasOperation_t trans; + int m; + int n; + int nrhs; + double **Aarray = nullptr; + int lda; + double **Carray = nullptr; + int ldc; + int info; + int devInfoArray; int request_id; cublasStatus_t scuda_intercept_result; - if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const cuComplex *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(double *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(double *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; scuda_intercept_result = - cublasCdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + &info, &devInfoArray, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37538,41 +39221,45 @@ int handle_cublasCdgmm_64(void *conn) { return -1; } -int handle_cublasZdgmm(void *conn) { +int handle_cublasCgelsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasSideMode_t mode; + cublasOperation_t trans; int m; - int n; - const cuDoubleComplex *A; - int lda; - cuDoubleComplex *x_null_check; - cuDoubleComplex x; - int incx; - cuDoubleComplex C; + int n; + int nrhs; + cuComplex **Aarray = nullptr; + int lda; + cuComplex **Carray = nullptr; int ldc; + int info; + int devInfoArray; int request_id; cublasStatus_t scuda_intercept_result; - if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuComplex *const *)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) || - 
rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || false) + rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; scuda_intercept_result = - cublasZdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + cublasCgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + &info, &devInfoArray, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37581,41 +39268,45 @@ int handle_cublasZdgmm(void *conn) { return -1; } -int handle_cublasZdgmm_64(void *conn) { +int handle_cublasZgelsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; - const cuDoubleComplex *A; - int64_t lda; - cuDoubleComplex *x_null_check; - cuDoubleComplex x; - int64_t incx; - cuDoubleComplex C; - int64_t ldc; + cublasOperation_t trans; + int m; + int n; + int nrhs; + cuDoubleComplex **Aarray = nullptr; + int lda; + cuDoubleComplex **Carray = nullptr; + int ldc; + int info; + int devInfoArray; int request_id; cublasStatus_t scuda_intercept_result; - if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; scuda_intercept_result = - cublasZdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + cublasZgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + &info, &devInfoArray, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37897,6 +39588,352 @@ int handle_cublasZtrttp(void *conn) { return -1; } +int handle_cublasSgetriBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + int n; + const float **A = nullptr; + int lda; + int 
*P_null_check; + int P; + float **C = nullptr; + int ldc; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const float *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int *)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(float *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasSgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgetriBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + int n; + const double **A = nullptr; + int lda; + int *P_null_check; + int P; + double **C = nullptr; + int ldc; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const double *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int *)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(double *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasDgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgetriBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + int n; + const cuComplex **A = nullptr; + int lda; + int *P_null_check; + int P; + cuComplex **C = nullptr; + int ldc; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int *)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(cuComplex *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasCgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgetriBatched(void *conn) { + int batchSize; + cublasHandle_t 
handle; + int n; + const cuDoubleComplex **A = nullptr; + int lda; + int *P_null_check; + int P; + cuDoubleComplex **C = nullptr; + int ldc; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int *)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasZgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgetrsBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int n; + int nrhs; + const float **Aarray = nullptr; + int lda; + int *devIpiv_null_check; + int devIpiv; + float **Barray = nullptr; + int ldb; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const float *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &devIpiv_null_check, sizeof(const int *)) < 0 || + (devIpiv_null_check && rpc_read(conn, &devIpiv, sizeof(const int)) < 0) || + rpc_read(conn, &Barray, sizeof(float *const *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasSgetrsBatched(handle, trans, n, nrhs, Aarray, lda, &devIpiv, Barray, + ldb, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgetrsBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int n; + int nrhs; + const double **Aarray = nullptr; + int lda; + int *devIpiv_null_check; + int devIpiv; + double **Barray = nullptr; + int ldb; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const double *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &devIpiv_null_check, sizeof(const int *)) < 0 || + (devIpiv_null_check && rpc_read(conn, &devIpiv, sizeof(const int)) < 0) || + rpc_read(conn, &Barray, sizeof(double *const *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, 
sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasDgetrsBatched(handle, trans, n, nrhs, Aarray, lda, &devIpiv, Barray, + ldb, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgetrsBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int n; + int nrhs; + const cuComplex **Aarray = nullptr; + int lda; + const int *devIpiv; + cuComplex **Barray = nullptr; + int ldb; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &devIpiv, sizeof(const int *)) < 0 || + rpc_read(conn, &Barray, sizeof(cuComplex *const *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasCgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, + ldb, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgetrsBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int n; + int nrhs; + const cuDoubleComplex **Aarray = nullptr; + int lda; + int *devIpiv_null_check; + int devIpiv; + cuDoubleComplex **Barray = nullptr; + int ldb; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &devIpiv_null_check, sizeof(const int *)) < 0 || + (devIpiv_null_check && rpc_read(conn, &devIpiv, sizeof(const int)) < 0) || + rpc_read(conn, &Barray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasZgetrsBatched(handle, trans, n, nrhs, Aarray, lda, &devIpiv, Barray, + ldb, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + int handle_cublasUint8gemmBias(void *conn) { cublasHandle_t handle; cublasOperation_t transa; @@ -41188,6 +43225,8 @@ static RequestHandler opHandlers[] = { handle_cublasChpr2_v2_64, handle_cublasZhpr2_v2, handle_cublasZhpr2_v2_64, + handle_cublasSgemvBatched, + handle_cublasTSTgemvBatched, 
handle_cublasSgemvStridedBatched, handle_cublasSgemvStridedBatched_64, handle_cublasDgemvStridedBatched, @@ -41282,6 +43321,18 @@ static RequestHandler opHandlers[] = { handle_cublasCtrmm_v2_64, handle_cublasZtrmm_v2, handle_cublasZtrmm_v2_64, + handle_cublasHgemmBatched, + handle_cublasHgemmBatched_64, + handle_cublasSgemmBatched, + handle_cublasSgemmBatched_64, + handle_cublasDgemmBatched, + handle_cublasDgemmBatched_64, + handle_cublasCgemmBatched, + handle_cublasCgemmBatched_64, + handle_cublasCgemm3mBatched, + handle_cublasCgemm3mBatched_64, + handle_cublasZgemmBatched, + handle_cublasZgemmBatched_64, handle_cublasHgemmStridedBatched, handle_cublasHgemmStridedBatched_64, handle_cublasSgemmStridedBatched, @@ -41295,6 +43346,7 @@ static RequestHandler opHandlers[] = { handle_cublasZgemmStridedBatched, handle_cublasZgemmStridedBatched_64, nullptr, + handle_cublasGemmBatchedEx_64, handle_cublasSgeam, handle_cublasSgeam_64, handle_cublasDgeam, @@ -41303,6 +43355,14 @@ static RequestHandler opHandlers[] = { handle_cublasCgeam_64, handle_cublasZgeam, handle_cublasZgeam_64, + handle_cublasStrsmBatched, + handle_cublasStrsmBatched_64, + handle_cublasDtrsmBatched, + handle_cublasDtrsmBatched_64, + handle_cublasCtrsmBatched, + handle_cublasCtrsmBatched_64, + handle_cublasZtrsmBatched, + handle_cublasZtrsmBatched_64, handle_cublasSdgmm, handle_cublasSdgmm_64, handle_cublasDdgmm, @@ -41311,6 +43371,18 @@ static RequestHandler opHandlers[] = { handle_cublasCdgmm_64, handle_cublasZdgmm, handle_cublasZdgmm_64, + handle_cublasSmatinvBatched, + handle_cublasDmatinvBatched, + handle_cublasCmatinvBatched, + handle_cublasZmatinvBatched, + handle_cublasSgeqrfBatched, + handle_cublasDgeqrfBatched, + handle_cublasCgeqrfBatched, + handle_cublasZgeqrfBatched, + handle_cublasSgelsBatched, + handle_cublasDgelsBatched, + handle_cublasCgelsBatched, + handle_cublasZgelsBatched, handle_cublasStpttr, handle_cublasDtpttr, handle_cublasCtpttr, @@ -41319,6 +43391,14 @@ static RequestHandler opHandlers[] = { handle_cublasDtrttp, handle_cublasCtrttp, handle_cublasZtrttp, + handle_cublasSgetriBatched, + handle_cublasDgetriBatched, + handle_cublasCgetriBatched, + handle_cublasZgetriBatched, + handle_cublasSgetrsBatched, + handle_cublasDgetrsBatched, + handle_cublasCgetrsBatched, + handle_cublasZgetrsBatched, handle_cublasUint8gemmBias, nullptr, nullptr,
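
For context on how the opHandlers table extended above is consumed, the sketch below shows one plausible dispatch step over it. Only RequestHandler, opHandlers, rpc_read, and the nullptr-means-unimplemented convention are taken from this diff; the opcode framing, the n_handlers parameter, and the function name dispatch_one are illustrative assumptions, not the project's actual event loop (in the generated file the table is static, so a real dispatcher would live in the same translation unit).

// Sketch (not part of the diff): routing one incoming request to the table.
#include <cstddef>

typedef int (*RequestHandler)(void *conn);   // matches the table's element type
extern RequestHandler opHandlers[];          // the table this diff extends
extern int rpc_read(void *conn, void *buf, size_t size);

int dispatch_one(void *conn, size_t n_handlers) {
  unsigned int op;
  if (rpc_read(conn, &op, sizeof(unsigned int)) < 0)
    return -1;                               // connection or framing error
  if (op >= n_handlers || opHandlers[op] == nullptr)
    return -1;                               // unknown or unimplemented RPC
  return opHandlers[op](conn);               // handler reads args and replies
}

The table's order must match the RPC opcode numbering exactly, which is why this diff inserts entries at the same positions where the corresponding RPC_* constants were inserted, and why unsupported calls keep a nullptr placeholder rather than being omitted.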
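Every new batched handler above follows the same read/execute/write shape, so the wire layout a client must produce can be read straight off the rpc_read chain. As one example, this is the field order implied by handle_cublasSgeqrfBatched; the struct is only documentation of that order (an assumption for illustration), since the handler reads each field individually and struct padding never touches the wire.

// Sketch (not part of the diff): request layout for handle_cublasSgeqrfBatched.
#include <cublas_v2.h>

struct SgeqrfBatchedRequest {
  int batchSize;           // length argument sent first, before the arrays it sizes
  cublasHandle_t handle;
  int m;
  int n;
  float *const *Aarray;    // device pointer-array, transferred by value
  int lda;
  float *const *TauArray;  // device pointer-array, transferred by value
  int info;                // scalar in; the response returns the updated info
};
// The response body is the updated info followed by the cublasStatus_t result
// that rpc_end_response sends back.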