diff --git a/codegen/codegen.py b/codegen/codegen.py
index 99961bd..67bac57 100644
--- a/codegen/codegen.py
+++ b/codegen/codegen.py
@@ -197,7 +197,7 @@ def client_rpc_write(self, f):
             f.write(
                 "        rpc_write(0, &{param_name}, sizeof({param_type})) < 0 ||\n".format(
                     param_name=self.parameter.name,
-                    param_type=self.parameter.name,
+                    param_type=self.ptr.array_of.format(),
                 )
             )
         else:
@@ -243,11 +243,10 @@ def client_unified_copy(self, f, direction, error):
     @property
     def server_declaration(self) -> str:
         if isinstance(self.ptr, Array):
-            c = self.ptr.const
-            self.ptr.const = False
-            # const[] isn't a valid part of a variable declaration
-            s = f"    {self.ptr.format().replace("const[]", "")}* {self.parameter.name} = nullptr;\n"
-            self.ptr.const = c
+            c = self.ptr.array_of.const
+            self.ptr.array_of.const = False
+            s = f"    {self.ptr.array_of.format()}* {self.parameter.name} = nullptr;\n"
+            self.ptr.array_of.const = c
         else:
             c = self.ptr.ptr_to.const
             self.ptr.ptr_to.const = False
@@ -281,9 +280,9 @@ def server_rpc_read(self, f, index) -> Optional[str]:
             )
         elif isinstance(self.ptr, Array):
             f.write(
-                "        rpc_read(conn, &{param_name}, sizeof({param_type})) < 0 ||\n".format(
+                "        rpc_read(conn, &{param_name}, sizeof({param_type}*)) < 0 ||\n".format(
                     param_name=self.parameter.name,
-                    param_type=self.ptr.format().replace("[]", ""),
+                    param_type=self.ptr.array_of.format(),
                 )
             )
         else:
@@ -690,7 +689,7 @@ def parse_annotation(annotation: str, params: list[Parameter]) -> list[tuple[Ope
             ))
         elif isinstance(param.type, Array):
             length_param = next(p for p in params if p.name == length_arg.split(":")[1])
-            if param.type.const:
+            if param.type.array_of.const:
                 recv = False
             operations.append(ArrayOperation(
                 send=send,
diff --git a/codegen/gen_api.h b/codegen/gen_api.h
index 502f648..2545cf2 100644
--- a/codegen/gen_api.h
+++ b/codegen/gen_api.h
@@ -1156,213 +1156,256 @@
 #define RPC_cublasChpr2_v2_64 1155
 #define RPC_cublasZhpr2_v2 1156
 #define RPC_cublasZhpr2_v2_64 1157
-#define RPC_cublasSgemvStridedBatched 1158
-#define RPC_cublasSgemvStridedBatched_64 1159
-#define RPC_cublasDgemvStridedBatched 1160
-#define RPC_cublasDgemvStridedBatched_64 1161
-#define RPC_cublasCgemvStridedBatched 1162
-#define RPC_cublasCgemvStridedBatched_64 1163
-#define RPC_cublasZgemvStridedBatched 1164
-#define RPC_cublasZgemvStridedBatched_64 1165
-#define RPC_cublasHSHgemvStridedBatched 1166
-#define RPC_cublasHSHgemvStridedBatched_64 1167
-#define RPC_cublasHSSgemvStridedBatched 1168
-#define RPC_cublasHSSgemvStridedBatched_64 1169
-#define RPC_cublasTSTgemvStridedBatched 1170
-#define RPC_cublasTSTgemvStridedBatched_64 1171
-#define RPC_cublasTSSgemvStridedBatched 1172
-#define RPC_cublasTSSgemvStridedBatched_64 1173
-#define RPC_cublasSgemm_v2 1174
-#define RPC_cublasSgemm_v2_64 1175
-#define RPC_cublasDgemm_v2 1176
-#define RPC_cublasDgemm_v2_64 1177
-#define RPC_cublasCgemm_v2 1178
-#define RPC_cublasCgemm_v2_64 1179
-#define RPC_cublasCgemm3m 1180
-#define RPC_cublasCgemm3m_64 1181
-#define RPC_cublasZgemm_v2 1182
-#define RPC_cublasZgemm_v2_64 1183
-#define RPC_cublasZgemm3m 1184
-#define RPC_cublasZgemm3m_64 1185
-#define RPC_cublasHgemm 1186
-#define RPC_cublasHgemm_64 1187
-#define RPC_cublasSsyrk_v2 1188
-#define RPC_cublasSsyrk_v2_64 1189
-#define RPC_cublasDsyrk_v2 1190
-#define RPC_cublasDsyrk_v2_64 1191
-#define RPC_cublasCsyrk_v2 1192
-#define RPC_cublasCsyrk_v2_64 1193
-#define RPC_cublasZsyrk_v2 1194
-#define RPC_cublasZsyrk_v2_64 1195
-#define RPC_cublasCherk_v2 1196
-#define RPC_cublasCherk_v2_64 1197
-#define RPC_cublasZherk_v2 1198
-#define RPC_cublasZherk_v2_64 1199
-#define RPC_cublasSsyr2k_v2 1200
-#define RPC_cublasSsyr2k_v2_64 1201
-#define RPC_cublasDsyr2k_v2 1202
-#define RPC_cublasDsyr2k_v2_64 1203
-#define RPC_cublasCsyr2k_v2 1204
-#define RPC_cublasCsyr2k_v2_64 1205
-#define RPC_cublasZsyr2k_v2 1206
-#define RPC_cublasZsyr2k_v2_64 1207
-#define RPC_cublasCher2k_v2 1208
-#define RPC_cublasCher2k_v2_64 1209
-#define RPC_cublasZher2k_v2 1210
-#define RPC_cublasZher2k_v2_64 1211
-#define RPC_cublasSsyrkx 1212
-#define RPC_cublasSsyrkx_64 1213
-#define RPC_cublasDsyrkx 1214
-#define RPC_cublasDsyrkx_64 1215
-#define RPC_cublasCsyrkx 1216
-#define RPC_cublasCsyrkx_64 1217
-#define RPC_cublasZsyrkx 1218
-#define RPC_cublasZsyrkx_64 1219
-#define RPC_cublasCherkx 1220
-#define RPC_cublasCherkx_64 1221
-#define RPC_cublasZherkx 1222
-#define RPC_cublasZherkx_64 1223
-#define RPC_cublasSsymm_v2 1224
-#define RPC_cublasSsymm_v2_64 1225
-#define RPC_cublasDsymm_v2 1226
-#define RPC_cublasDsymm_v2_64 1227
-#define RPC_cublasCsymm_v2 1228
-#define RPC_cublasCsymm_v2_64 1229
-#define RPC_cublasZsymm_v2 1230
-#define RPC_cublasZsymm_v2_64 1231
-#define RPC_cublasChemm_v2 1232
-#define RPC_cublasChemm_v2_64 1233
-#define RPC_cublasZhemm_v2 1234
-#define RPC_cublasZhemm_v2_64 1235
-#define RPC_cublasStrsm_v2 1236
-#define RPC_cublasStrsm_v2_64 1237
-#define RPC_cublasDtrsm_v2 1238
-#define RPC_cublasDtrsm_v2_64 1239
-#define RPC_cublasCtrsm_v2 1240
-#define RPC_cublasCtrsm_v2_64 1241
-#define RPC_cublasZtrsm_v2 1242
-#define RPC_cublasZtrsm_v2_64 1243
-#define RPC_cublasStrmm_v2 1244
-#define RPC_cublasStrmm_v2_64 1245
-#define RPC_cublasDtrmm_v2 1246
-#define RPC_cublasDtrmm_v2_64 1247
-#define RPC_cublasCtrmm_v2 1248
-#define RPC_cublasCtrmm_v2_64 1249
-#define RPC_cublasZtrmm_v2 1250
-#define RPC_cublasZtrmm_v2_64 1251
-#define RPC_cublasHgemmStridedBatched 1252
-#define RPC_cublasHgemmStridedBatched_64 1253
-#define RPC_cublasSgemmStridedBatched 1254
-#define RPC_cublasSgemmStridedBatched_64 1255
-#define RPC_cublasDgemmStridedBatched 1256
-#define RPC_cublasDgemmStridedBatched_64 1257
-#define RPC_cublasCgemmStridedBatched 1258
-#define RPC_cublasCgemmStridedBatched_64 1259
-#define RPC_cublasCgemm3mStridedBatched 1260
-#define RPC_cublasCgemm3mStridedBatched_64 1261
-#define RPC_cublasZgemmStridedBatched 1262
-#define RPC_cublasZgemmStridedBatched_64 1263
-#define RPC_cublasGemmBatchedEx 1264
-#define RPC_cublasSgeam 1265
-#define RPC_cublasSgeam_64 1266
-#define RPC_cublasDgeam 1267
-#define RPC_cublasDgeam_64 1268
-#define RPC_cublasCgeam 1269
-#define RPC_cublasCgeam_64 1270
-#define RPC_cublasZgeam 1271
-#define RPC_cublasZgeam_64 1272
-#define RPC_cublasSdgmm 1273
-#define RPC_cublasSdgmm_64 1274
-#define RPC_cublasDdgmm 1275
-#define RPC_cublasDdgmm_64 1276
-#define RPC_cublasCdgmm 1277
-#define RPC_cublasCdgmm_64 1278
-#define RPC_cublasZdgmm 1279
-#define RPC_cublasZdgmm_64 1280
-#define RPC_cublasStpttr 1281
-#define RPC_cublasDtpttr 1282
-#define RPC_cublasCtpttr 1283
-#define RPC_cublasZtpttr 1284
-#define RPC_cublasStrttp 1285
-#define RPC_cublasDtrttp 1286
-#define RPC_cublasCtrttp 1287
-#define RPC_cublasZtrttp 1288
-#define RPC_cublasUint8gemmBias 1289
-#define RPC_cublasMigrateComputeType 1290
-#define RPC_cudnnGetVersion 1291
-#define RPC_cudnnGetMaxDeviceVersion 1292
-#define RPC_cudnnGetCudartVersion 1293
-#define RPC_cudnnGetErrorString 1294
-#define RPC_cudnnGetLastErrorString 1295
-#define RPC_cudnnQueryRuntimeError 1296
-#define RPC_cudnnGetProperty 1297
-#define RPC_cudnnCreate 1298
-#define RPC_cudnnDestroy 1299
-#define RPC_cudnnSetStream 1300
-#define RPC_cudnnGetStream 1301
-#define RPC_cudnnGetCallback 1302
-#define RPC_cudnnGraphVersionCheck 1303
-#define RPC_cudnnBackendCreateDescriptor 1304
-#define RPC_cudnnBackendDestroyDescriptor 1305
-#define RPC_cudnnBackendInitialize 1306
-#define RPC_cudnnBackendFinalize 1307
-#define RPC_cudnnBackendSetAttribute 1308
-#define RPC_cudnnBackendExecute 1309
-#define RPC_cudnnBackendPopulateCudaGraph 1310
-#define RPC_cudnnBackendUpdateCudaGraph 1311
-#define RPC_cudnnCreateTensorDescriptor 1312
-#define RPC_cudnnSetTensor4dDescriptor 1313
-#define RPC_cudnnSetTensor4dDescriptorEx 1314
-#define RPC_cudnnGetTensor4dDescriptor 1315
-#define RPC_cudnnGetTensorSizeInBytes 1316
-#define RPC_cudnnDestroyTensorDescriptor 1317
-#define RPC_cudnnInitTransformDest 1318
-#define RPC_cudnnCreateTensorTransformDescriptor 1319
-#define RPC_cudnnDestroyTensorTransformDescriptor 1320
-#define RPC_cudnnCreateOpTensorDescriptor 1321
-#define RPC_cudnnSetOpTensorDescriptor 1322
-#define RPC_cudnnGetOpTensorDescriptor 1323
-#define RPC_cudnnDestroyOpTensorDescriptor 1324
-#define RPC_cudnnCreateReduceTensorDescriptor 1325
-#define RPC_cudnnSetReduceTensorDescriptor 1326
-#define RPC_cudnnGetReduceTensorDescriptor 1327
-#define RPC_cudnnDestroyReduceTensorDescriptor 1328
-#define RPC_cudnnGetReductionIndicesSize 1329
-#define RPC_cudnnGetReductionWorkspaceSize 1330
-#define RPC_cudnnCreateFilterDescriptor 1331
-#define RPC_cudnnSetFilter4dDescriptor 1332
-#define RPC_cudnnGetFilter4dDescriptor 1333
-#define RPC_cudnnGetFilterSizeInBytes 1334
-#define RPC_cudnnDestroyFilterDescriptor 1335
-#define RPC_cudnnCreatePoolingDescriptor 1336
-#define RPC_cudnnSetPooling2dDescriptor 1337
-#define RPC_cudnnGetPooling2dDescriptor 1338
-#define RPC_cudnnGetPooling2dForwardOutputDim 1339
-#define RPC_cudnnDestroyPoolingDescriptor 1340
-#define RPC_cudnnCreateActivationDescriptor 1341
-#define RPC_cudnnSetActivationDescriptor 1342
-#define RPC_cudnnGetActivationDescriptor 1343
-#define RPC_cudnnSetActivationDescriptorSwishBeta 1344
-#define RPC_cudnnGetActivationDescriptorSwishBeta 1345
-#define RPC_cudnnDestroyActivationDescriptor 1346
-#define RPC_cudnnActivationForward 1347
-#define RPC_cudnnCreateLRNDescriptor 1348
-#define RPC_cudnnSetLRNDescriptor 1349
-#define RPC_cudnnGetLRNDescriptor 1350
-#define RPC_cudnnDestroyLRNDescriptor 1351
-#define RPC_cudnnDeriveBNTensorDescriptor 1352
-#define RPC_cudnnDeriveNormTensorDescriptor 1353
-#define RPC_cudnnCreateSpatialTransformerDescriptor 1354
-#define RPC_cudnnDestroySpatialTransformerDescriptor 1355
-#define RPC_cudnnCreateDropoutDescriptor 1356
-#define RPC_cudnnDestroyDropoutDescriptor 1357
-#define RPC_cudnnDropoutGetStatesSize 1358
-#define RPC_cudnnDropoutGetReserveSpaceSize 1359
-#define RPC_cudnnGetDropoutDescriptor 1360
-#define RPC_cudnnOpsVersionCheck 1361
-#define RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1362
-#define RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1363
-#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1364
-#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1365
-#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1366
-#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1367
+#define RPC_cublasSgemvBatched 1158
+#define RPC_cublasTSTgemvBatched 1159
+#define RPC_cublasSgemvStridedBatched 1160
+#define RPC_cublasSgemvStridedBatched_64 1161
+#define RPC_cublasDgemvStridedBatched 1162
+#define RPC_cublasDgemvStridedBatched_64 1163
+#define RPC_cublasCgemvStridedBatched 1164
+#define RPC_cublasCgemvStridedBatched_64 1165
+#define RPC_cublasZgemvStridedBatched 1166
+#define RPC_cublasZgemvStridedBatched_64 1167
+#define RPC_cublasHSHgemvStridedBatched 1168
+#define RPC_cublasHSHgemvStridedBatched_64 1169
+#define RPC_cublasHSSgemvStridedBatched 1170
+#define RPC_cublasHSSgemvStridedBatched_64 1171
+#define RPC_cublasTSTgemvStridedBatched 1172
+#define RPC_cublasTSTgemvStridedBatched_64 1173
+#define RPC_cublasTSSgemvStridedBatched 1174
+#define RPC_cublasTSSgemvStridedBatched_64 1175
+#define RPC_cublasSgemm_v2 1176
+#define RPC_cublasSgemm_v2_64 1177
+#define RPC_cublasDgemm_v2 1178
+#define RPC_cublasDgemm_v2_64 1179
+#define RPC_cublasCgemm_v2 1180
+#define RPC_cublasCgemm_v2_64 1181
+#define RPC_cublasCgemm3m 1182
+#define RPC_cublasCgemm3m_64 1183
+#define RPC_cublasZgemm_v2 1184
+#define RPC_cublasZgemm_v2_64 1185
+#define RPC_cublasZgemm3m 1186
+#define RPC_cublasZgemm3m_64 1187
+#define RPC_cublasHgemm 1188
+#define RPC_cublasHgemm_64 1189
+#define RPC_cublasSsyrk_v2 1190
+#define RPC_cublasSsyrk_v2_64 1191
+#define RPC_cublasDsyrk_v2 1192
+#define RPC_cublasDsyrk_v2_64 1193
+#define RPC_cublasCsyrk_v2 1194
+#define RPC_cublasCsyrk_v2_64 1195
+#define RPC_cublasZsyrk_v2 1196
+#define RPC_cublasZsyrk_v2_64 1197
+#define RPC_cublasCherk_v2 1198
+#define RPC_cublasCherk_v2_64 1199
+#define RPC_cublasZherk_v2 1200
+#define RPC_cublasZherk_v2_64 1201
+#define RPC_cublasSsyr2k_v2 1202
+#define RPC_cublasSsyr2k_v2_64 1203
+#define RPC_cublasDsyr2k_v2 1204
+#define RPC_cublasDsyr2k_v2_64 1205
+#define RPC_cublasCsyr2k_v2 1206
+#define RPC_cublasCsyr2k_v2_64 1207
+#define RPC_cublasZsyr2k_v2 1208
+#define RPC_cublasZsyr2k_v2_64 1209
+#define RPC_cublasCher2k_v2 1210
+#define RPC_cublasCher2k_v2_64 1211
+#define RPC_cublasZher2k_v2 1212
+#define RPC_cublasZher2k_v2_64 1213
+#define RPC_cublasSsyrkx 1214
+#define RPC_cublasSsyrkx_64 1215
+#define RPC_cublasDsyrkx 1216
+#define RPC_cublasDsyrkx_64 1217
+#define RPC_cublasCsyrkx 1218
+#define RPC_cublasCsyrkx_64 1219
+#define RPC_cublasZsyrkx 1220
+#define RPC_cublasZsyrkx_64 1221
+#define RPC_cublasCherkx 1222
+#define RPC_cublasCherkx_64 1223
+#define RPC_cublasZherkx 1224
+#define RPC_cublasZherkx_64 1225
+#define RPC_cublasSsymm_v2 1226
+#define RPC_cublasSsymm_v2_64 1227
+#define RPC_cublasDsymm_v2 1228
+#define RPC_cublasDsymm_v2_64 1229
+#define RPC_cublasCsymm_v2 1230
+#define RPC_cublasCsymm_v2_64 1231
+#define RPC_cublasZsymm_v2 1232
+#define RPC_cublasZsymm_v2_64 1233
+#define RPC_cublasChemm_v2 1234
+#define RPC_cublasChemm_v2_64 1235
+#define RPC_cublasZhemm_v2 1236
+#define RPC_cublasZhemm_v2_64 1237
+#define RPC_cublasStrsm_v2 1238
+#define RPC_cublasStrsm_v2_64 1239
+#define RPC_cublasDtrsm_v2 1240
+#define RPC_cublasDtrsm_v2_64 1241
+#define RPC_cublasCtrsm_v2 1242
+#define RPC_cublasCtrsm_v2_64 1243
+#define RPC_cublasZtrsm_v2 1244
+#define RPC_cublasZtrsm_v2_64 1245
+#define RPC_cublasStrmm_v2 1246
+#define RPC_cublasStrmm_v2_64 1247
+#define RPC_cublasDtrmm_v2 1248
+#define RPC_cublasDtrmm_v2_64 1249
+#define RPC_cublasCtrmm_v2 1250
+#define RPC_cublasCtrmm_v2_64 1251
+#define RPC_cublasZtrmm_v2 1252
+#define RPC_cublasZtrmm_v2_64 1253
+#define RPC_cublasHgemmBatched 1254
+#define RPC_cublasHgemmBatched_64 1255
+#define RPC_cublasSgemmBatched 1256
+#define RPC_cublasSgemmBatched_64 1257
+#define RPC_cublasDgemmBatched 1258
+#define RPC_cublasDgemmBatched_64 1259
+#define RPC_cublasCgemmBatched 1260
+#define RPC_cublasCgemmBatched_64 1261
+#define RPC_cublasCgemm3mBatched 1262
+#define RPC_cublasCgemm3mBatched_64 1263
+#define RPC_cublasZgemmBatched 1264
+#define RPC_cublasZgemmBatched_64 1265
+#define RPC_cublasHgemmStridedBatched 1266
+#define RPC_cublasHgemmStridedBatched_64 1267
+#define RPC_cublasSgemmStridedBatched 1268
+#define RPC_cublasSgemmStridedBatched_64 1269
+#define RPC_cublasDgemmStridedBatched 1270
+#define RPC_cublasDgemmStridedBatched_64 1271
+#define RPC_cublasCgemmStridedBatched 1272
+#define RPC_cublasCgemmStridedBatched_64 1273
+#define RPC_cublasCgemm3mStridedBatched 1274
+#define RPC_cublasCgemm3mStridedBatched_64 1275
+#define RPC_cublasZgemmStridedBatched 1276
+#define RPC_cublasZgemmStridedBatched_64 1277
+#define RPC_cublasGemmBatchedEx 1278
+#define RPC_cublasGemmBatchedEx_64 1279
+#define RPC_cublasSgeam 1280
+#define RPC_cublasSgeam_64 1281
+#define RPC_cublasDgeam 1282
+#define RPC_cublasDgeam_64 1283
+#define RPC_cublasCgeam 1284
+#define RPC_cublasCgeam_64 1285
+#define RPC_cublasZgeam 1286
+#define RPC_cublasZgeam_64 1287
+#define RPC_cublasStrsmBatched 1288
+#define RPC_cublasStrsmBatched_64 1289
+#define RPC_cublasDtrsmBatched 1290
+#define RPC_cublasDtrsmBatched_64 1291
+#define RPC_cublasCtrsmBatched 1292
+#define RPC_cublasCtrsmBatched_64 1293
+#define RPC_cublasZtrsmBatched 1294
+#define RPC_cublasZtrsmBatched_64 1295
+#define RPC_cublasSdgmm 1296
+#define RPC_cublasSdgmm_64 1297
+#define RPC_cublasDdgmm 1298
+#define RPC_cublasDdgmm_64 1299
+#define RPC_cublasCdgmm 1300
+#define RPC_cublasCdgmm_64 1301
+#define RPC_cublasZdgmm 1302
+#define RPC_cublasZdgmm_64 1303
+#define RPC_cublasSmatinvBatched 1304
+#define RPC_cublasDmatinvBatched 1305
+#define RPC_cublasCmatinvBatched 1306
+#define RPC_cublasZmatinvBatched 1307
+#define RPC_cublasSgeqrfBatched 1308
+#define RPC_cublasDgeqrfBatched 1309
+#define RPC_cublasCgeqrfBatched 1310
+#define RPC_cublasZgeqrfBatched 1311
+#define RPC_cublasSgelsBatched 1312
+#define RPC_cublasDgelsBatched 1313
+#define RPC_cublasCgelsBatched 1314
+#define RPC_cublasZgelsBatched 1315
+#define RPC_cublasStpttr 1316
+#define RPC_cublasDtpttr 1317
+#define RPC_cublasCtpttr 1318
+#define RPC_cublasZtpttr 1319
+#define RPC_cublasStrttp 1320
+#define RPC_cublasDtrttp 1321
+#define RPC_cublasCtrttp 1322
+#define RPC_cublasZtrttp 1323
+#define RPC_cublasSgetriBatched 1324
+#define RPC_cublasDgetriBatched 1325
+#define RPC_cublasCgetriBatched 1326
+#define RPC_cublasZgetriBatched 1327
+#define RPC_cublasSgetrsBatched 1328
+#define RPC_cublasDgetrsBatched 1329
+#define RPC_cublasCgetrsBatched 1330
+#define RPC_cublasZgetrsBatched 1331
+#define RPC_cublasUint8gemmBias 1332
+#define RPC_cublasMigrateComputeType 1333
+#define RPC_cudnnGetVersion 1334
+#define RPC_cudnnGetMaxDeviceVersion 1335
+#define RPC_cudnnGetCudartVersion 1336
+#define RPC_cudnnGetErrorString 1337
+#define RPC_cudnnGetLastErrorString 1338
+#define RPC_cudnnQueryRuntimeError 1339
+#define RPC_cudnnGetProperty 1340
+#define RPC_cudnnCreate 1341
+#define RPC_cudnnDestroy 1342
+#define RPC_cudnnSetStream 1343
+#define RPC_cudnnGetStream 1344
+#define RPC_cudnnGetCallback 1345
+#define RPC_cudnnGraphVersionCheck 1346
+#define RPC_cudnnBackendCreateDescriptor 1347
+#define RPC_cudnnBackendDestroyDescriptor 1348
+#define RPC_cudnnBackendInitialize 1349
+#define RPC_cudnnBackendFinalize 1350
+#define RPC_cudnnBackendSetAttribute 1351
+#define RPC_cudnnBackendExecute 1352
+#define RPC_cudnnBackendPopulateCudaGraph 1353
+#define RPC_cudnnBackendUpdateCudaGraph 1354
+#define RPC_cudnnCreateTensorDescriptor 1355
+#define RPC_cudnnSetTensor4dDescriptor 1356
+#define RPC_cudnnSetTensor4dDescriptorEx 1357
+#define RPC_cudnnGetTensor4dDescriptor 1358
+#define RPC_cudnnGetTensorSizeInBytes 1359
+#define RPC_cudnnDestroyTensorDescriptor 1360
+#define RPC_cudnnInitTransformDest 1361
+#define RPC_cudnnCreateTensorTransformDescriptor 1362
+#define RPC_cudnnDestroyTensorTransformDescriptor 1363
+#define RPC_cudnnCreateOpTensorDescriptor 1364
+#define RPC_cudnnSetOpTensorDescriptor 1365
+#define RPC_cudnnGetOpTensorDescriptor 1366
+#define RPC_cudnnDestroyOpTensorDescriptor 1367
+#define RPC_cudnnCreateReduceTensorDescriptor 1368
+#define RPC_cudnnSetReduceTensorDescriptor 1369
+#define RPC_cudnnGetReduceTensorDescriptor 1370
+#define RPC_cudnnDestroyReduceTensorDescriptor 1371
+#define RPC_cudnnGetReductionIndicesSize 1372
+#define RPC_cudnnGetReductionWorkspaceSize 1373
+#define RPC_cudnnCreateFilterDescriptor 1374
+#define RPC_cudnnSetFilter4dDescriptor 1375
+#define RPC_cudnnGetFilter4dDescriptor 1376
+#define RPC_cudnnGetFilterSizeInBytes 1377
+#define RPC_cudnnDestroyFilterDescriptor 1378
+#define RPC_cudnnCreatePoolingDescriptor 1379
+#define RPC_cudnnSetPooling2dDescriptor 1380
+#define RPC_cudnnGetPooling2dDescriptor 1381
+#define RPC_cudnnGetPooling2dForwardOutputDim 1382
+#define RPC_cudnnDestroyPoolingDescriptor 1383
+#define RPC_cudnnCreateActivationDescriptor 1384
+#define RPC_cudnnSetActivationDescriptor 1385
+#define RPC_cudnnGetActivationDescriptor 1386
+#define RPC_cudnnSetActivationDescriptorSwishBeta 1387
+#define RPC_cudnnGetActivationDescriptorSwishBeta 1388
+#define RPC_cudnnDestroyActivationDescriptor 1389
+#define RPC_cudnnActivationForward 1390
+#define RPC_cudnnCreateLRNDescriptor 1391
+#define RPC_cudnnSetLRNDescriptor 1392
+#define RPC_cudnnGetLRNDescriptor 1393
+#define RPC_cudnnDestroyLRNDescriptor 1394
+#define RPC_cudnnDeriveBNTensorDescriptor 1395
+#define RPC_cudnnDeriveNormTensorDescriptor 1396
+#define RPC_cudnnCreateSpatialTransformerDescriptor 1397
+#define RPC_cudnnDestroySpatialTransformerDescriptor 1398
+#define RPC_cudnnCreateDropoutDescriptor 1399
+#define RPC_cudnnDestroyDropoutDescriptor 1400
+#define RPC_cudnnDropoutGetStatesSize 1401
+#define RPC_cudnnDropoutGetReserveSpaceSize 1402
+#define RPC_cudnnGetDropoutDescriptor 1403
+#define RPC_cudnnOpsVersionCheck 1404
+#define RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize 1405
+#define RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize 1406
+#define RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize 1407
+#define RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize 1408
+#define RPC_cudnnGetNormalizationBackwardWorkspaceSize 1409
+#define RPC_cudnnGetNormalizationTrainingReserveSpaceSize 1410
diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp
index dfcfe4d..b9979f5 100644
--- a/codegen/gen_client.cpp
+++ b/codegen/gen_client.cpp
@@ -35254,6 +35254,234 @@ cublasStatus_t cublasZhpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo,
   return return_value;
 }
 
+cublasStatus_t
+cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n,
+                   const float *alpha, const float *const Aarray[], int lda,
+                   const float *const xarray[], int incx, const float *beta,
+                   float *const yarray[], int incy, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)xarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)xarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)xarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)yarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)yarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)yarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incy, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSgemvBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const float *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &xarray, sizeof(const float *const)) < 0 ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const float *)) < 0 ||
+      rpc_write(0, &yarray, sizeof(float *const)) < 0 ||
+      rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)xarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)xarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)xarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)yarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)yarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)yarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incy, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasTSTgemvBatched(
+    cublasHandle_t handle, cublasOperation_t trans, int m, int n,
+    const float *alpha, const __nv_bfloat16 *const Aarray[], int lda,
+    const __nv_bfloat16 *const xarray[], int incx, const float *beta,
+    __nv_bfloat16 *const yarray[], int incy, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)xarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)xarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)xarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)yarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)yarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)yarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incy, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasTSTgemvBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const __nv_bfloat16 *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &xarray, sizeof(const __nv_bfloat16 *const)) < 0 ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const float *)) < 0 ||
+      rpc_write(0, &yarray, sizeof(__nv_bfloat16 *const)) < 0 ||
+      rpc_write(0, &incy, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)xarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)xarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)xarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)yarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)yarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)yarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incy, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
 cublasStatus_t
 cublasSgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans,
                           int m, int n, const float *alpha, const float *A, int lda,
@@ -42823,12 +43051,14 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side,
 }
 
 cublasStatus_t
-cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
-                          cublasOperation_t transb, int m, int n, int k,
-                          const __half *alpha, const __half *A, int lda,
-                          long long int strideA, const __half *B, int ldb,
-                          long long int strideB, const __half *beta, __half *C,
-                          int ldc, long long int strideC, int batchCount) {
+cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const __half *alpha, const __half *const Aarray[], int lda,
+                   const __half *const Barray[], int ldb, const __half *beta,
+                   __half *const Carray[], int ldc, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -42841,55 +43071,61 @@ cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasHgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasHgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const __half *)) < 0 ||
-      (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) ||
-      rpc_write(0, &A, sizeof(const __half *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const __half *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const __half *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const __half *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const __half *)) < 0 ||
-      (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) ||
-      rpc_write(0, C, sizeof(__half)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(__half)) < 0 ||
+      rpc_write(0, &Carray, sizeof(__half *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -42904,38 +43140,52 @@ cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasHgemmStridedBatched_64(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const __half *alpha, const __half *A,
-    int64_t lda, long long int strideA, const __half *B, int64_t ldb,
-    long long int strideB, const __half *beta, __half *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle,
+                                     cublasOperation_t transa,
+                                     cublasOperation_t transb, int64_t m,
+                                     int64_t n, int64_t k, const __half *alpha,
+                                     const __half *const Aarray[], int64_t lda,
+                                     const __half *const Barray[], int64_t ldb,
+                                     const __half *beta, __half *const Carray[],
+                                     int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -42948,33 +43198,43 @@ cublasStatus_t cublasHgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasHgemmStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasHgemmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -42982,22 +43242,18 @@ cublasStatus_t cublasHgemmStridedBatched_64(
       rpc_write(0, &n, sizeof(int64_t)) < 0 ||
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const __half *)) < 0 ||
-      (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) ||
-      rpc_write(0, &A, sizeof(const __half *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const __half *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const __half *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const __half *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const __half *)) < 0 ||
-      (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) ||
-      rpc_write(0, C, sizeof(__half)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(__half)) < 0 ||
+      rpc_write(0, &Carray, sizeof(__half *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43012,39 +43268,50 @@ cublasStatus_t cublasHgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
 cublasStatus_t
-cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
-                          cublasOperation_t transb, int m, int n, int k,
-                          const float *alpha, const float *A, int lda,
-                          long long int strideA, const float *B, int ldb,
-                          long long int strideB, const float *beta, float *C,
-                          int ldc, long long int strideC, int batchCount) {
+cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const float *alpha, const float *const Aarray[], int lda,
+                   const float *const Barray[], int ldb, const float *beta,
+                   float *const Carray[], int ldc, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43057,55 +43324,61 @@ cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasSgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasSgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
-      (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) ||
-      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const float *const)) < 0 ||
      rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const float *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const float *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const float *)) < 0 ||
-      (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) ||
-      rpc_write(0, C, sizeof(float)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &Carray, sizeof(float *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43120,38 +43393,52 @@ cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasSgemmStridedBatched_64(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const float *alpha, const float *A,
-    int64_t lda, long long int strideA, const float *B, int64_t ldb,
-    long long int strideB, const float *beta, float *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle,
+                                     cublasOperation_t transa,
+                                     cublasOperation_t transb, int64_t m,
+                                     int64_t n, int64_t k, const float *alpha,
+                                     const float *const Aarray[], int64_t lda,
+                                     const float *const Barray[], int64_t ldb,
+                                     const float *beta, float *const Carray[],
+                                     int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43164,33 +43451,43 @@ cublasStatus_t cublasSgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasSgemmStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasSgemmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43198,22 +43495,18 @@ cublasStatus_t cublasSgemmStridedBatched_64(
       rpc_write(0, &n, sizeof(int64_t)) < 0 ||
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
-      (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) ||
-      rpc_write(0, &A, sizeof(const float *)) < 0 ||
*)) < 0 || + rpc_write(0, &Aarray, sizeof(const float *const)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const float *)) < 0 || + rpc_write(0, &Barray, sizeof(const float *const)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const float *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || - rpc_write(0, C, sizeof(float)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || - rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 || + rpc_write(0, &Carray, sizeof(float *const)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0) @@ -43228,39 +43521,50 @@ cublasStatus_t cublasSgemmStridedBatched_64( return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, 
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
 cublasStatus_t
-cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
-                          cublasOperation_t transb, int m, int n, int k,
-                          const double *alpha, const double *A, int lda,
-                          long long int strideA, const double *B, int ldb,
-                          long long int strideB, const double *beta, double *C,
-                          int ldc, long long int strideC, int batchCount) {
+cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const double *alpha, const double *const Aarray[], int lda,
+                   const double *const Barray[], int ldb, const double *beta,
+                   double *const Carray[], int ldc, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43275,31 +43579,41 @@ cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43307,21 +43621,19 @@ cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const double *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) ||
-      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const double *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const double *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const double *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const double *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) ||
-      rpc_write(0, C, sizeof(double)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &Carray, sizeof(double *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43336,38 +43648,52 @@ cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasDgemmStridedBatched_64(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const double *alpha, const double *A,
-    int64_t lda, long long int strideA, const double *B, int64_t ldb,
-    long long int strideB, const double *beta, double *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle,
+                                     cublasOperation_t transa,
+                                     cublasOperation_t transb, int64_t m,
+                                     int64_t n, int64_t k, const double *alpha,
+                                     const double *const Aarray[], int64_t lda,
+                                     const double *const Barray[], int64_t ldb,
+                                     const double *beta, double *const Carray[],
+                                     int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43382,31 +43708,41 @@ cublasStatus_t cublasDgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDgemmStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasDgemmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43415,21 +43751,19 @@ cublasStatus_t cublasDgemmStridedBatched_64(
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const double *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) ||
-      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const double *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const double *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const double *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const double *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) ||
-      rpc_write(0, C, sizeof(double)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &Carray, sizeof(double *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43444,38 +43778,51 @@ cublasStatus_t cublasDgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCgemmStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const cuComplex *A, int lda,
-    long long int strideA, const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, cuComplex *C, int ldc, long long int strideC,
-    int batchCount) {
+cublasStatus_t
+cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const cuComplex *alpha, const cuComplex *const Aarray[],
+                   int lda, const cuComplex *const Barray[], int ldb,
+                   const cuComplex *beta, cuComplex *const Carray[], int ldc,
+                   int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43490,31 +43837,41 @@ cublasStatus_t cublasCgemmStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43522,21 +43879,19 @@ cublasStatus_t cublasCgemmStridedBatched(
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43551,38 +43906,51 @@ cublasStatus_t cublasCgemmStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCgemmStridedBatched_64(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const cuComplex *alpha, const cuComplex *A,
-    int64_t lda, long long int strideA, const cuComplex *B, int64_t ldb,
-    long long int strideB, const cuComplex *beta, cuComplex *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+cublasStatus_t
+cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa,
+                      cublasOperation_t transb, int64_t m, int64_t n, int64_t k,
+                      const cuComplex *alpha, const cuComplex *const Aarray[],
+                      int64_t lda, const cuComplex *const Barray[], int64_t ldb,
+                      const cuComplex *beta, cuComplex *const Carray[],
+                      int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43597,31 +43965,41 @@ cublasStatus_t cublasCgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCgemmStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43630,21 +44008,19 @@ cublasStatus_t cublasCgemmStridedBatched_64(
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43659,38 +44035,51 @@ cublasStatus_t cublasCgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCgemm3mStridedBatched(
-    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuComplex *alpha, const cuComplex *A, int lda,
-    long long int strideA, const cuComplex *B, int ldb, long long int strideB,
-    const cuComplex *beta, cuComplex *C, int ldc, long long int strideC,
-    int batchCount) {
+cublasStatus_t
+cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa,
+                     cublasOperation_t transb, int m, int n, int k,
+                     const cuComplex *alpha, const cuComplex *const Aarray[],
+                     int lda, const cuComplex *const Barray[], int ldb,
+                     const cuComplex *beta, cuComplex *const Carray[], int ldc,
+                     int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43705,31 +44094,41 @@ cublasStatus_t cublasCgemm3mStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemm3mBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43737,21 +44136,19 @@ cublasStatus_t cublasCgemm3mStridedBatched(
       rpc_write(0, &k, sizeof(int)) < 0 ||
       rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43766,38 +44163,50 @@ cublasStatus_t cublasCgemm3mStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCgemm3mStridedBatched_64(
+cublasStatus_t cublasCgemm3mBatched_64(
     cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int64_t m, int64_t n, int64_t k, const cuComplex *alpha, const cuComplex *A,
-    int64_t lda, long long int strideA, const cuComplex *B, int64_t ldb,
-    long long int strideB, const cuComplex *beta, cuComplex *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+    int64_t m, int64_t n, int64_t k, const cuComplex *alpha,
+    const cuComplex *const Aarray[], int64_t lda,
+    const cuComplex *const Barray[], int64_t ldb, const cuComplex *beta,
+    cuComplex *const Carray[], int64_t ldc, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43812,31 +44221,41 @@ cublasStatus_t cublasCgemm3mStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemm3mBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43845,21 +44264,19 @@ cublasStatus_t cublasCgemm3mStridedBatched_64(
       rpc_write(0, &k, sizeof(int64_t)) < 0 ||
       rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
       (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
       (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
-      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43874,38 +44291,50 @@ cublasStatus_t cublasCgemm3mStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZgemmStridedBatched(
+cublasStatus_t cublasZgemmBatched(
     cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-    int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A,
-    int lda, long long int strideA, const cuDoubleComplex *B, int ldb,
-    long long int strideB, const cuDoubleComplex *beta, cuDoubleComplex *C,
-    int ldc, long long int strideC, int batchCount) {
+    int m, int n, int k, const cuDoubleComplex *alpha,
+    const cuDoubleComplex *const Aarray[], int lda,
+    const cuDoubleComplex *const Barray[], int ldb, const cuDoubleComplex *beta,
+    cuDoubleComplex *const Carray[], int ldc, int batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -43920,31 +44349,41 @@ cublasStatus_t cublasZgemmStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZgemmStridedBatched) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgemmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
@@ -43953,23 +44392,20 @@ cublasStatus_t cublasZgemmStridedBatched(
       rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
       (alpha != nullptr &&
        rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuDoubleComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
-      rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const cuDoubleComplex *const)) < 0 ||
       rpc_write(0, &ldb, sizeof(int)) < 0 ||
-      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
       rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 ||
       (beta != nullptr &&
        rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 ||
-      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
-      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
-      rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
      rpc_end_response(0, &return_value) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
@@ -43984,39 +44420,51 @@ cublasStatus_t cublasZgemmStridedBatched(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
-      0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZgemmStridedBatched_64(
+cublasStatus_t cublasZgemmBatched_64(
     cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
     int64_t m, int64_t n, int64_t k, const cuDoubleComplex *alpha,
-    const cuDoubleComplex *A, int64_t lda, long long int strideA,
-    const cuDoubleComplex *B, int64_t ldb, long long int strideB,
-    const cuDoubleComplex *beta, cuDoubleComplex *C, int64_t ldc,
-    long long int strideC, int64_t batchCount) {
+    const cuDoubleComplex *const Aarray[], int64_t lda,
+    const cuDoubleComplex *const Barray[], int64_t ldb,
+    const cuDoubleComplex *beta, cuDoubleComplex *const Carray[], int64_t ldc,
+    int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
@@ -44031,31 +44479,41 @@ cublasStatus_t cublasZgemmStridedBatched_64(
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
(maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < - 0) - return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasZgemmStridedBatched_64) < 0 || + if (rpc_start_request(0, RPC_cublasZgemmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || @@ -44065,23 +44523,20 @@ cublasStatus_t cublasZgemmStridedBatched_64( rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 || (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &Aarray, sizeof(const cuDoubleComplex *const)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &strideA, sizeof(long long int)) < 0 || - rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &Barray, sizeof(const cuDoubleComplex *const)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, &strideB, sizeof(long long int)) < 0 || rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 || (beta != nullptr && rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || - rpc_write(0, &strideC, sizeof(long long int)) < 0 || - rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || - rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &Carray, sizeof(cuDoubleComplex *const)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0) @@ -44096,37 +44551,48 @@ cublasStatus_t cublasZgemmStridedBatched_64( return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if 
(maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchCount) && + is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) - return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < - 0) - return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - const float *alpha, const float *A, int lda, - const float *beta, const float *B, int ldb, float *C, - int ldc) { +cublasStatus_t +cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const __half *alpha, const __half *A, int lda, + long long int strideA, const __half *B, int ldb, + long long int strideB, const __half *beta, __half *C, + int ldc, long long int strideC, int batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44137,39 +44603,55 @@ cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, 
(void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasSgeam) < 0 || + if (rpc_start_request(0, RPC_cublasHgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const float *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || - rpc_write(0, &A, sizeof(const float *)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const __half *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) || + rpc_write(0, &A, sizeof(const __half *)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const float *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || - rpc_write(0, &B, sizeof(const float *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const __half *)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, C, sizeof(float)) < 0 || - rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(float)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const __half *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) || + rpc_write(0, C, sizeof(__half)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44182,30 +44664,42 @@ cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, 
(void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int64_t m, int64_t n, - const float *alpha, const float *A, int64_t lda, - const float *beta, const float *B, int64_t ldb, - float *C, int64_t ldc) { +cublasStatus_t cublasHgemmStridedBatched_64( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int64_t m, int64_t n, int64_t k, const __half *alpha, const __half *A, + int64_t lda, long long int strideA, const __half *B, int64_t ldb, + long long int strideB, const __half *beta, __half *C, int64_t ldc, + long long int strideC, int64_t batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44216,40 +44710,56 @@ cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasSgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasHgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, 
sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const float *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || - rpc_write(0, &A, sizeof(const float *)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const __half *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const __half)) < 0) || + rpc_write(0, &A, sizeof(const __half *)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const float *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || - rpc_write(0, &B, sizeof(const float *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const __half *)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, C, sizeof(float)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(float)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const __half *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const __half)) < 0) || + rpc_write(0, C, sizeof(__half)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44262,30 +44772,43 @@ cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasDgeam(cublasHandle_t 
handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - const double *alpha, const double *A, int lda, - const double *beta, const double *B, int ldb, - double *C, int ldc) { +cublasStatus_t +cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float *alpha, const float *A, int lda, + long long int strideA, const float *B, int ldb, + long long int strideB, const float *beta, float *C, + int ldc, long long int strideC, int batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44296,39 +44819,55 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDgeam) < 0 || + if (rpc_start_request(0, RPC_cublasSgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const double *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || - rpc_write(0, &A, sizeof(const double *)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const float *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(const float *)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const double *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || - rpc_write(0, &B, sizeof(const double *)) < 0 || + rpc_write(0, &strideA, sizeof(long 
long int)) < 0 || + rpc_write(0, &B, sizeof(const float *)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, C, sizeof(double)) < 0 || - rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(double)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const float *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || + rpc_write(0, C, sizeof(float)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44341,30 +44880,42 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int64_t m, int64_t n, - const double *alpha, const double *A, int64_t lda, - const double *beta, const double *B, int64_t ldb, - double *C, int64_t ldc) { +cublasStatus_t cublasSgemmStridedBatched_64( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int64_t m, int64_t n, int64_t k, const float *alpha, const float *A, + int64_t lda, long long int strideA, const float *B, int64_t ldb, + long long int strideB, const float *beta, float *C, int64_t ldc, + long long int strideC, int64_t batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44375,40 +44926,56 @@ cublasStatus_t 
cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasSgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const double *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || - rpc_write(0, &A, sizeof(const double *)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const float *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(const float *)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const double *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || - rpc_write(0, &B, sizeof(const double *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const float *)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, C, sizeof(double)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(double)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const float *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) || + rpc_write(0, C, sizeof(float)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 
0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44421,30 +44988,43 @@ cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - const cuComplex *alpha, const cuComplex *A, int lda, - const cuComplex *beta, const cuComplex *B, int ldb, - cuComplex *C, int ldc) { +cublasStatus_t +cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const double *alpha, const double *A, int lda, + long long int strideA, const double *B, int ldb, + long long int strideB, const double *beta, double *C, + int ldc, long long int strideC, int batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44455,39 +45035,55 @@ cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, 
cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCgeam) < 0 || + if (rpc_start_request(0, RPC_cublasDgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const double *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double *)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || - rpc_write(0, &B, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const double *)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, C, sizeof(cuComplex)) < 0 || - rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const double *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || + rpc_write(0, C, sizeof(double)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44500,31 +45096,42 @@ cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int64_t m, int64_t n, - const cuComplex *alpha, const cuComplex *A, - int64_t lda, const cuComplex *beta, - const cuComplex *B, int64_t ldb, cuComplex *C, - int64_t ldc) { +cublasStatus_t cublasDgemmStridedBatched_64( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int64_t m, int64_t n, int64_t k, const double *alpha, const double *A, + int64_t lda, long long int strideA, const double *B, int64_t ldb, + long long int strideB, const double *beta, double *C, int64_t ldc, + long long int strideC, int64_t batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44535,40 +45142,56 @@ cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasDgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || - (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const double *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double *)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 || - (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || - rpc_write(0, &B, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const double *)) < 0 || rpc_write(0, &ldb, sizeof(int64_t)) < 0 || - rpc_write(0, C, sizeof(cuComplex)) < 0 || - rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const double *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) || + rpc_write(0, C, sizeof(double)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44581,32 +45204,42 @@ cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; 
if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *beta, - const cuDoubleComplex *B, int ldb, - cuDoubleComplex *C, int ldc) { +cublasStatus_t cublasCgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, const cuComplex *A, int lda, + long long int strideA, const cuComplex *B, int ldb, long long int strideB, + const cuComplex *beta, cuComplex *C, int ldc, long long int strideC, + int batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44617,41 +45250,55 @@ cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasZgeam) < 0 || + if (rpc_start_request(0, RPC_cublasCgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &alpha, sizeof(const 
cuDoubleComplex *)) < 0 || - (alpha != nullptr && - rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &k, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 || - (beta != nullptr && - rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &strideA, sizeof(long long int)) < 0 || + rpc_write(0, &B, sizeof(const cuComplex *)) < 0 || rpc_write(0, &ldb, sizeof(int)) < 0 || - rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || - rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &strideB, sizeof(long long int)) < 0 || + rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 || + (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) || + rpc_write(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, &strideC, sizeof(long long int)) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) @@ -44664,32 +45311,42 @@ cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int64_t m, int64_t n, - const cuDoubleComplex *alpha, - const 
cuDoubleComplex *A, int64_t lda, - const cuDoubleComplex *beta, - const cuDoubleComplex *B, int64_t ldb, - cuDoubleComplex *C, int64_t ldc) { +cublasStatus_t cublasCgemmStridedBatched_64( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int64_t m, int64_t n, int64_t k, const cuComplex *alpha, const cuComplex *A, + int64_t lda, long long int strideA, const cuComplex *B, int64_t ldb, + long long int strideB, const cuComplex *beta, cuComplex *C, int64_t ldc, + long long int strideC, int64_t batchCount) { if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) @@ -44700,42 +45357,56 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasZgeam_64) < 0 || + if (rpc_start_request(0, RPC_cublasCgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &m, sizeof(int64_t)) < 0 || rpc_write(0, &n, sizeof(int64_t)) < 0 || - rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 || - (alpha != nullptr && - rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || - rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &k, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || rpc_write(0, &lda, sizeof(int64_t)) < 0 || - rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 || - (beta != nullptr && - rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || - 
-      rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
       rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
@@ -44748,317 +45419,3750 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
-                           int n, const float *A, int lda, const float *x,
-                           int incx, float *C, int ldc) {
+cublasStatus_t cublasCgemm3mStridedBatched(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int m, int n, int k, const cuComplex *alpha, const cuComplex *A, int lda,
+    long long int strideA, const cuComplex *B, int ldb, long long int strideB,
+    const cuComplex *beta, cuComplex *C, int ldc, long long int strideC,
+    int batchCount) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasSdgmm) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &k, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &x, sizeof(const float *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) ||
-      rpc_write(0, &incx, sizeof(int)) < 0 ||
-      rpc_write(0, C, sizeof(float)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
-                              int64_t m, int64_t n, const float *A, int64_t lda,
-                              const float *x, int64_t incx, float *C,
-                              int64_t ldc) {
+cublasStatus_t cublasCgemm3mStridedBatched_64(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int64_t m, int64_t n, int64_t k, const cuComplex *alpha, const cuComplex *A,
+    int64_t lda, long long int strideA, const cuComplex *B, int64_t ldb,
+    long long int strideB, const cuComplex *beta, cuComplex *C, int64_t ldc,
+    long long int strideC, int64_t batchCount) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasSdgmm_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched_64) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int64_t)) < 0 ||
       rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &x, sizeof(const float *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) ||
-      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(float)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
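// [Editorial aside, not part of the generated diff.] The strided-batched stubs
// above all marshal in declaration order: enums and integers go by value,
// device pointers (A, B, C) cross the wire as raw addresses, and the
// host-resident scalars alpha/beta go as "address first, payload only if
// non-null" so the server can tell a null pointer from a value. A minimal
// sketch of that last pattern, using a hypothetical in-memory byte stream in
// place of the real rpc_write channel:
#include <cstddef>
#include <vector>

struct ByteStream { // stand-in for the RPC connection; 0 = success, like rpc_write
  std::vector<unsigned char> bytes;
  int write(const void *p, std::size_t n) {
    const unsigned char *b = static_cast<const unsigned char *>(p);
    bytes.insert(bytes.end(), b, b + n);
    return 0;
  }
};

// Mirrors:  rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
//           (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0)
int write_host_scalar(ByteStream &s, const float *alpha) {
  if (s.write(&alpha, sizeof(alpha)) < 0) // the pointer value itself
    return -1;
  if (alpha != nullptr && s.write(alpha, sizeof(*alpha)) < 0) // then the payload
    return -1;
  return 0;
}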
 
-cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
-                           int n, const double *A, int lda, const double *x,
-                           int incx, double *C, int ldc) {
+cublasStatus_t cublasZgemmStridedBatched(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A,
+    int lda, long long int strideA, const cuDoubleComplex *B, int ldb,
+    long long int strideB, const cuDoubleComplex *beta, cuDoubleComplex *C,
+    int ldc, long long int strideC, int batchCount) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDdgmm) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgemmStridedBatched) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &k, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha != nullptr &&
+       rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &x, sizeof(const double *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) ||
-      rpc_write(0, &incx, sizeof(int)) < 0 ||
-      rpc_write(0, C, sizeof(double)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 ||
+      (beta != nullptr &&
+       rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
-                              int64_t m, int64_t n, const double *A,
-                              int64_t lda, const double *x, int64_t incx,
-                              double *C, int64_t ldc) {
+cublasStatus_t cublasZgemmStridedBatched_64(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int64_t m, int64_t n, int64_t k, const cuDoubleComplex *alpha,
+    const cuDoubleComplex *A, int64_t lda, long long int strideA,
+    const cuDoubleComplex *B, int64_t ldb, long long int strideB,
+    const cuDoubleComplex *beta, cuDoubleComplex *C, int64_t ldc,
+    long long int strideC, int64_t batchCount) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDdgmm_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgemmStridedBatched_64) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &m, sizeof(int64_t)) < 0 ||
       rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha != nullptr &&
+       rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
       rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &x, sizeof(const double *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) ||
-      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(double)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &strideA, sizeof(long long int)) < 0 ||
+      rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideB, sizeof(long long int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 ||
+      (beta != nullptr &&
+       rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &strideC, sizeof(long long int)) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&strideA, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideB, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&strideC, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
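// [Editorial aside, not part of the generated diff.] Every stub brackets the
// RPC with maybe_copy_unified_arg: each argument is pushed HostToDevice before
// the call and pulled DeviceToHost after, so unified-memory buffers stay
// coherent across the client/server split. One plausible shape for such a
// helper, sketched with the CUDA runtime; the name, the prefetch policy, the
// device-0 assumption, and the 1-byte count are illustrative only, not the
// project's actual implementation:
#include <cuda_runtime.h>

static int maybe_copy_unified_arg_sketch(const void *p, cudaMemcpyKind dir) {
  if (p == nullptr)
    return 0;
  cudaPointerAttributes attr{};
  if (cudaPointerGetAttributes(&attr, p) != cudaSuccess)
    return 0; // plain host memory (e.g. &lda on the stack): nothing to do
  if (attr.type != cudaMemoryTypeManaged)
    return 0; // only managed allocations need migrating
  // Prefetch toward whichever side the copy direction names as the consumer.
  int dst = (dir == cudaMemcpyHostToDevice) ? 0 : cudaCpuDeviceId;
  return cudaMemPrefetchAsync(p, 1, dst, 0) == cudaSuccess ? 0 : -1;
}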
 
-cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
-                           int n, const cuComplex *A, int lda,
-                           const cuComplex *x, int incx, cuComplex *C,
-                           int ldc) {
+cublasStatus_t
+cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t transa,
+                       cublasOperation_t transb, int64_t m, int64_t n,
+                       int64_t k, const void *alpha, const void *const Aarray[],
+                       cudaDataType Atype, int64_t lda,
+                       const void *const Barray[], cudaDataType Btype,
+                       int64_t ldb, const void *beta, void *const Carray[],
+                       cudaDataType Ctype, int64_t ldc, int64_t batchCount,
+                       cublasComputeType_t computeType, cublasGemmAlgo_t algo) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Atype, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Btype, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Ctype, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&computeType, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&algo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCdgmm) < 0 ||
+  if (rpc_start_request(0, RPC_cublasGemmBatchedEx_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &k, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const void *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const void *)) < 0) ||
+      rpc_write(0, &Aarray, sizeof(const void *const)) < 0 ||
+      rpc_write(0, &Atype, sizeof(cudaDataType)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &Barray, sizeof(const void *const)) < 0 ||
+      rpc_write(0, &Btype, sizeof(cudaDataType)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &beta, sizeof(const void *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const void *)) < 0) ||
+      rpc_write(0, &Carray, sizeof(void *const)) < 0 ||
+      rpc_write(0, &Ctype, sizeof(cudaDataType)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &computeType, sizeof(cublasComputeType_t)) < 0 ||
+      rpc_write(0, &algo, sizeof(cublasGemmAlgo_t)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&k, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Atype, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Btype, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0; i < static_cast<int>(batchCount) &&
+                  is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&Ctype, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&computeType, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&algo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
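// [Editorial aside, not part of the generated diff.] cublasGemmBatchedEx_64 is
// the one entry point above where an argument is a host array of per-matrix
// device pointers, so the unified-memory pass has to dereference one level:
// first the array itself, then each of its batchCount elements, exactly as the
// static_cast<int>(batchCount) loops do. A compact model of that traversal:
#include <cstdint>

using CopyFn = int (*)(const void *); // stand-in for maybe_copy_unified_arg

int copy_batched_array(const void *const arr[], int64_t batchCount,
                       CopyFn copy) {
  if (copy(arr) < 0) // the pointer array itself
    return -1;
  for (int i = 0; i < static_cast<int>(batchCount); i++)
    if (copy(arr[i]) < 0) // each per-matrix device pointer
      return -1;
  return 0;
}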
 
+cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n,
+                           const float *alpha, const float *A, int lda,
+                           const float *beta, const float *B, int ldb, float *C,
+                           int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSgeam) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const float *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) ||
+      rpc_write(0, &B, sizeof(const float *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa,
+                              cublasOperation_t transb, int64_t m, int64_t n,
+                              const float *alpha, const float *A, int64_t lda,
+                              const float *beta, const float *B, int64_t ldb,
+                              float *C, int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSgeam_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const float *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &beta, sizeof(const float *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const float)) < 0) ||
+      rpc_write(0, &B, sizeof(const float *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n,
+                           const double *alpha, const double *A, int lda,
+                           const double *beta, const double *B, int ldb,
+                           double *C, int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDgeam) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const double *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const double *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) ||
+      rpc_write(0, &B, sizeof(const double *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa,
+                              cublasOperation_t transb, int64_t m, int64_t n,
+                              const double *alpha, const double *A, int64_t lda,
+                              const double *beta, const double *B, int64_t ldb,
+                              double *C, int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDgeam_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const double *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &beta, sizeof(const double *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const double)) < 0) ||
+      rpc_write(0, &B, sizeof(const double *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa,
+                           cublasOperation_t transb, int m, int n,
+                           const cuComplex *alpha, const cuComplex *A, int lda,
+                           const cuComplex *beta, const cuComplex *B, int ldb,
+                           cuComplex *C, int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasCgeam) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa,
+                              cublasOperation_t transb, int64_t m, int64_t n,
+                              const cuComplex *alpha, const cuComplex *A,
+                              int64_t lda, const cuComplex *beta,
+                              const cuComplex *B, int64_t ldb, cuComplex *C,
+                              int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasCgeam_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 ||
+      (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &beta, sizeof(const cuComplex *)) < 0 ||
+      (beta != nullptr && rpc_write(0, beta, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
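// [Editorial aside, not part of the generated diff.] All of the geam stubs
// above share one frame: rpc_start_request(opcode), a chain of rpc_write
// calls, a single rpc_wait_for_response, rpc_read for the out-matrix C,
// rpc_end_response(&status), with every transport failure collapsed to
// CUBLAS_STATUS_NOT_INITIALIZED. Assuming only the 0-on-success convention of
// the rpc_* calls, the frame itself reduces to this sketch (hooks are
// hypothetical and trivially stubbed so the example stands alone):
#include <functional>

static int start_request(int op) { (void)op; return 0; }
static int wait_for_response() { return 0; }
static int end_response(int *status) { *status = 0; return 0; }

int framed_call(int op, int err, const std::function<int()> &writes,
                const std::function<int()> &reads) {
  int status = 0;
  if (start_request(op) < 0 || writes() < 0 || wait_for_response() < 0 ||
      reads() < 0 || end_response(&status) < 0)
    return err; // mirrors "return CUBLAS_STATUS_NOT_INITIALIZED;"
  return status; // mirrors "return return_value;"
}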
cuDoubleComplex *)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int64_t m, int64_t n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int64_t lda, + const cuDoubleComplex *beta, + const cuDoubleComplex *B, int64_t ldb, + cuDoubleComplex *C, int64_t ldc) { + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if 
(maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgeam_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 || + (alpha != nullptr && + rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &beta, sizeof(const cuDoubleComplex *)) < 0 || + (beta != nullptr && + rpc_write(0, beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transa, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&transb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)beta, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const float *alpha, const float *const A[], + int lda, float *const B[], int ldb, + int batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 
0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasStrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const float *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, sizeof(const float *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(float *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if 
(maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t +cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int64_t m, int64_t n, + const float *alpha, const float *const A[], int64_t lda, + float *const B[], int64_t ldb, int64_t batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasStrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const float *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const float)) < 0) || + rpc_write(0, &A, 
sizeof(const float *const)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(float *const)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const double *alpha, const double *const A[], + int lda, double *const B[], int ldb, + int batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const double *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(double *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], 
cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t +cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int64_t m, int64_t n, + const double *alpha, const double *const A[], int64_t lda, + double *const B[], int64_t ldb, int64_t batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDtrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const double *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const double)) < 0) || + rpc_write(0, &A, sizeof(const double *const)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(double *const)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, 
cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t +cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, const cuComplex *alpha, + const cuComplex *const A[], int lda, cuComplex *const B[], + int ldb, int batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if 
(maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCtrsmBatched) < 0 || + rpc_write(0, &batchCount, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &B, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCtrsmBatched_64( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, + const cuComplex *alpha, const cuComplex *const A[], int64_t lda, + 
cuComplex *const B[], int64_t ldb, int64_t batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCtrsmBatched_64) < 0 || + rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &alpha, sizeof(const cuComplex *)) < 0 || + (alpha != nullptr && rpc_write(0, alpha, sizeof(const cuComplex)) < 0) || + rpc_write(0, &A, sizeof(const cuComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &B, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, 
(void *)&diag, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZtrsmBatched( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuDoubleComplex *alpha, const cuDoubleComplex *const A[], int lda, + cuDoubleComplex *const B[], int ldb, int batchCount) { + if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)A); + i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchCount) && is_unified_pointer(0, (void *)B); + i++) + if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + 
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasZtrsmBatched) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha != nullptr &&
+       rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &B, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)A);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)B);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasZtrsmBatched_64(
+    cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
+    cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n,
+    const cuDoubleComplex *alpha, const cuDoubleComplex *const A[], int64_t lda,
+    cuDoubleComplex *const B[], int64_t ldb, int64_t batchCount) {
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)A);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)B);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasZtrsmBatched_64) < 0 ||
+      rpc_write(0, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &alpha, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha != nullptr &&
+       rpc_write(0, alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &B, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchCount, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&side, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&diag, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)alpha, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)A);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)B, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchCount) && is_unified_pointer(0, (void *)B);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)B[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
+                           int n, const float *A, int lda, const float *x,
+                           int incx, float *C, int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSdgmm) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &x, sizeof(const float *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
+                              int64_t m, int64_t n, const float *A, int64_t lda,
+                              const float *x, int64_t incx, float *C,
+                              int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasSdgmm_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &x, sizeof(const float *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const float)) < 0) ||
+      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(float)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
+                           int n, const double *A, int lda, const double *x,
+                           int incx, double *C, int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDdgmm) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &x, sizeof(const double *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
+                              int64_t m, int64_t n, const double *A,
+                              int64_t lda, const double *x, int64_t incx,
+                              double *C, int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDdgmm_64) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &x, sizeof(const double *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const double)) < 0) ||
+      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
+      rpc_write(0, C, sizeof(double)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
+                           int n, const cuComplex *A, int lda,
+                           const cuComplex *x, int incx, cuComplex *C,
+                           int ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasCdgmm) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &x, sizeof(const cuComplex *)) < 0 ||
+      (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, &incx, sizeof(int)) < 0 ||
+      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
+                              int64_t m, int64_t n, const cuComplex *A,
+                              int64_t lda, const cuComplex *x, int64_t incx,
+                              cuComplex *C, int64_t ldc) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+ return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCdgmm_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &x, sizeof(const cuComplex *)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) || + rpc_write(0, &incx, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(cuComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, + int n, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *C, int ldc) { + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZdgmm) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &x, sizeof(const cuDoubleComplex *)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &incx, sizeof(int)) < 0 || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, + int64_t m, int64_t n, const cuDoubleComplex *A, + int64_t lda, const cuDoubleComplex *x, + int64_t incx, cuDoubleComplex *C, int64_t ldc) { + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZdgmm_64) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 || + rpc_write(0, &m, sizeof(int64_t)) < 0 || + rpc_write(0, &n, sizeof(int64_t)) < 0 || + rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_write(0, &lda, sizeof(int64_t)) < 0 || + rpc_write(0, &x, sizeof(const cuDoubleComplex *)) < 0 || + (x != nullptr && rpc_write(0, x, sizeof(const cuDoubleComplex)) < 0) || + rpc_write(0, &incx, sizeof(int64_t)) < 0 || + rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, + const float *const A[], int lda, + float *const Ainv[], int lda_inv, int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if 
(maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const float *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(float *const)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, + const double *const A[], int lda, + double *const Ainv[], int lda_inv, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], 
cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const double *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(double *const)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, + const cuComplex *const A[], int lda, + cuComplex *const Ainv[], int lda_inv, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyHostToDevice) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const cuComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, + const cuDoubleComplex *const A[], int lda, + cuDoubleComplex *const Ainv[], int lda_inv, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyHostToDevice) < 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZmatinvBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &A, sizeof(const cuDoubleComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Ainv, sizeof(cuDoubleComplex *const)) < 0 || + rpc_write(0, &lda_inv, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)A); i++) + if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Ainv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Ainv); + i++) + if (maybe_copy_unified_arg(0, (void *)Ainv[i], cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda_inv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, + float *const Aarray[], int lda, + float *const TauArray[], int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void 
*)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(float *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(float *const)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, + double *const Aarray[], int lda, + double *const TauArray[], int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void 
*)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(double *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(double *const)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, + cuComplex *const Aarray[], int lda, + cuComplex *const TauArray[], int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, 
(void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(cuComplex *const)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, + cuDoubleComplex *const Aarray[], int lda, + cuDoubleComplex *const TauArray[], int *info, + int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + 
is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasZgeqrfBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(cuDoubleComplex *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &TauArray, sizeof(cuDoubleComplex *const)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)TauArray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; i < static_cast(batchSize) && + is_unified_pointer(0, (void *)TauArray); + i++) + if (maybe_copy_unified_arg(0, (void *)TauArray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, int m, int n, + int nrhs, float *const Aarray[], int lda, + float *const Carray[], int ldc, int *info, + int *devInfoArray, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return 
CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasSgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(float *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(float *const)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + 
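+// Editorial note (not generator output): every client stub in this file
+// follows the same generated shape. Before the call, maybe_copy_unified_arg()
+// gives the transport a chance to synchronize any unified-memory arguments;
+// rpc_start_request() then sends the operation id (e.g. RPC_cublasSgelsBatched)
+// and rpc_write() serializes each argument in declaration order. After
+// rpc_wait_for_response(), rpc_read() pulls back the output parameters
+// (info, devInfoArray, C, A, ...) and rpc_end_response() yields the remote
+// cublasStatus_t. Every failure path collapses to
+// CUBLAS_STATUS_NOT_INITIALIZED. For the batched entry points, the generator
+// also walks the host-side pointer arrays (Aarray[i], Carray[i], ...) so that
+// unified pointers inside the batch are synchronized as well.
+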
+cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, int m, int n, + int nrhs, double *const Aarray[], int lda, + double *const Carray[], int ldc, int *info, + int *devInfoArray, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasDgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(double *const)) < 0 || + rpc_write(0, &lda, sizeof(int)) < 0 || + rpc_write(0, &Carray, sizeof(double *const)) < 0 || + rpc_write(0, &ldc, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || + rpc_write(0, devInfoArray, sizeof(int)) < 0 || + rpc_wait_for_response(0) < 0 || rpc_read(0, info, sizeof(int)) < 0 || + rpc_read(0, devInfoArray, sizeof(int)) < 0 || + rpc_end_response(0, &return_value) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, 
cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + return return_value; +} + +cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, int m, int n, + int nrhs, cuComplex *const Aarray[], int lda, + cuComplex *const Carray[], int ldc, int *info, + int *devInfoArray, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Carray); + i++) + if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + cublasStatus_t return_value; + if (rpc_start_request(0, RPC_cublasCgelsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || + rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 
||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 ||
+      rpc_write(0, devInfoArray, sizeof(int)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, info, sizeof(int)) < 0 ||
+      rpc_read(0, devInfoArray, sizeof(int)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasZgelsBatched(cublasHandle_t handle,
+                                  cublasOperation_t trans, int m, int n,
+                                  int nrhs, cuDoubleComplex *const Aarray[],
+                                  int lda, cuDoubleComplex *const Carray[],
+                                  int ldc, int *info, int *devInfoArray,
+                                  int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyHostToDevice) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasZgelsBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &Carray, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 ||
+      rpc_write(0, devInfoArray, sizeof(int)) < 0 ||
+      rpc_wait_for_response(0) < 0 || rpc_read(0, info, sizeof(int)) < 0 ||
+      rpc_read(0, devInfoArray, sizeof(int)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Carray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)Carray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Carray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)devInfoArray, cudaMemcpyDeviceToHost) <
+      0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
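Every stub added by this patch has the same three-phase shape: sync any unified-memory arguments toward the device, marshal the call over the RPC connection, then mirror the copies back toward the host. The sketch below condenses that shape; it is illustrative only and not part of the patch. The extern declarations are assumptions standing in for the project's rpc/unified-memory headers, and RPC_example/exampleCall are hypothetical names.

    #include <cstddef>
    #include <cuda_runtime_api.h>

    // Assumed helper signatures (hypothetical; see the project's headers).
    extern int rpc_start_request(const int conn, const unsigned int op);
    extern int rpc_write(const int conn, const void *data, std::size_t size);
    extern int rpc_wait_for_response(const int conn);
    extern int rpc_end_response(const int conn, void *return_value);
    extern int maybe_copy_unified_arg(const int conn, void *arg,
                                      cudaMemcpyKind kind);
    extern int is_unified_pointer(const int conn, void *ptr);

    #define RPC_example 0 /* hypothetical opcode */

    // Hypothetical stub showing the three phases of every wrapper above.
    int exampleCall(float *const batch[], int batchSize) {
      // 1. Pre-call: sync unified-memory args toward the device, including
      //    each element of a batched pointer array when it is unified.
      if (maybe_copy_unified_arg(0, (void *)batch, cudaMemcpyHostToDevice) < 0)
        return -1;
      for (int i = 0; i < batchSize && is_unified_pointer(0, (void *)batch);
           i++)
        if (maybe_copy_unified_arg(0, (void *)batch[i],
                                   cudaMemcpyHostToDevice) < 0)
          return -1;
      // 2. Marshal: one request, ordered writes, then a blocking wait.
      int return_value;
      if (rpc_start_request(0, RPC_example) < 0 ||
          rpc_write(0, &batchSize, sizeof(int)) < 0 ||
          rpc_write(0, &batch, sizeof(float *const)) < 0 ||
          rpc_wait_for_response(0) < 0 ||
          rpc_end_response(0, &return_value) < 0)
        return -1;
      // 3. Post-call: mirror the copies back toward the host.
      if (maybe_copy_unified_arg(0, (void *)batch, cudaMemcpyDeviceToHost) < 0)
        return -1;
      return return_value;
    }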
+cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const float *AP, float *A, int lda) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasStpttr) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &AP, sizeof(const float *)) < 0 ||
+      (AP != nullptr && rpc_write(0, AP, sizeof(const float)) < 0) ||
+      rpc_write(0, A, sizeof(float)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, A, sizeof(float)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const double *AP, double *A, int lda) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasDtpttr) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &AP, sizeof(const double *)) < 0 ||
+      (AP != nullptr && rpc_write(0, AP, sizeof(const double)) < 0) ||
+      rpc_write(0, A, sizeof(double)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, A, sizeof(double)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const cuComplex *AP, cuComplex *A, int lda) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasCtpttr) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &AP, sizeof(const cuComplex *)) < 0 ||
+      (AP != nullptr && rpc_write(0, AP, sizeof(const cuComplex)) < 0) ||
+      rpc_write(0, A, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, A, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
+cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const cuDoubleComplex *AP, cuDoubleComplex *A,
+                            int lda) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasZtpttr) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &AP, sizeof(const cuDoubleComplex *)) < 0 ||
+      (AP != nullptr && rpc_write(0, AP, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, A, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(0, &return_value) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  return return_value;
+}
+
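Note the two-step serialization of the const pointer argument in the Xtpttr stubs above: the stub first writes the pointer value itself, then, guarded against nullptr, exactly one element (sizeof(const float), not the full packed matrix). The hypothetical helper below, which is illustrative and not part of the patch, condenses that idiom; only sizeof(T) bytes cross the wire, so for device-resident buffers the pointer identity is what actually matters.

    #include <cstddef>

    extern int rpc_write(const int conn, const void *data, std::size_t size);

    // Hypothetical helper mirroring the idiom used for AP above.
    template <typename T>
    int write_const_ptr_arg(const int conn, const T *p) {
      if (rpc_write(conn, &p, sizeof(const T *)) < 0) // pointer identity
        return -1;
      if (p != nullptr && rpc_write(conn, p, sizeof(const T)) < 0) // one elem
        return -1;
      return 0;
    }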
+cublasStatus_t cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const float *A, int lda, float *AP) {
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t return_value;
+  if (rpc_start_request(0, RPC_cublasStrttp) < 0 ||
+      rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const float *)) < 0 ||
+      (A != nullptr && rpc_write(0, A, sizeof(const float)) < 0) ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &x, sizeof(const cuComplex *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &incx, sizeof(int)) < 0 ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, AP, sizeof(float)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, AP, sizeof(float)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
@@ -45066,62 +49170,40 @@ cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
-                              int64_t m, int64_t n, const cuComplex *A,
-                              int64_t lda, const cuComplex *x, int64_t incx,
-                              cuComplex *C, int64_t ldc) {
+cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const double *A, int lda, double *AP) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCdgmm_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasDtrttp) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
-      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &x, sizeof(const cuComplex *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const double *)) < 0 ||
+      (A != nullptr && rpc_write(0, A, sizeof(const double)) < 0) ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, AP, sizeof(double)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
@@ -45129,61 +49211,40 @@ cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
-                           int n, const cuDoubleComplex *A, int lda,
-                           const cuDoubleComplex *x, int incx,
-                           cuDoubleComplex *C, int ldc) {
+cublasStatus_t cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const cuComplex *A, int lda, cuComplex *AP) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZdgmm) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCtrttp) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_write(0, &m, sizeof(int)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
+      rpc_write(0, &A, sizeof(const cuComplex *)) < 0 ||
+      (A != nullptr && rpc_write(0, A, sizeof(const cuComplex)) < 0) ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, &x, sizeof(const cuDoubleComplex *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, &incx, sizeof(int)) < 0 ||
-      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, AP, sizeof(cuComplex)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, AP, sizeof(cuComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
@@ -45191,62 +49252,42 @@ cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
-                              int64_t m, int64_t n, const cuDoubleComplex *A,
-                              int64_t lda, const cuDoubleComplex *x,
-                              int64_t incx, cuDoubleComplex *C, int64_t ldc) {
+cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n,
+                            const cuDoubleComplex *A, int lda,
+                            cuDoubleComplex *AP) {
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZdgmm_64) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZtrttp) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_write(0, &m, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &n, sizeof(int64_t)) < 0 ||
+      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &n, sizeof(int)) < 0 ||
       rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
-      rpc_write(0, &lda, sizeof(int64_t)) < 0 ||
-      rpc_write(0, &x, sizeof(const cuDoubleComplex *)) < 0 ||
-      (x != nullptr && rpc_write(0, x, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, &incx, sizeof(int64_t)) < 0 ||
-      rpc_write(0, C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &ldc, sizeof(int64_t)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 ||
+      (A != nullptr && rpc_write(0, A, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, AP, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&mode, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&m, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
@@ -45254,344 +49295,676 @@ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode,
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)x, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&incx, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
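The getri/getrs stubs that follow add a per-element pass over each batched pointer array on top of the usual argument sync. The hypothetical helper below condenses that loop; it is illustrative only, not part of the patch, and the extern declarations are assumptions standing in for the project's unified-memory helpers.

    #include <cuda_runtime_api.h>

    extern int maybe_copy_unified_arg(const int conn, void *arg,
                                      cudaMemcpyKind kind);
    extern int is_unified_pointer(const int conn, void *ptr);

    // Hypothetical helper matching the generated loops below: the pointer
    // array itself is synced first, then, only while the array is unified
    // memory (the guard short-circuits to zero iterations otherwise), each
    // per-matrix pointer it holds.
    template <typename T>
    int sync_batched_args(const int conn, T *const batch[], int batchSize,
                          cudaMemcpyKind kind) {
      if (maybe_copy_unified_arg(conn, (void *)batch, kind) < 0)
        return -1;
      for (int i = 0;
           i < batchSize && is_unified_pointer(conn, (void *)batch); i++)
        if (maybe_copy_unified_arg(conn, (void *)batch[i], kind) < 0)
          return -1;
      return 0;
    }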
-cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-                            const float *AP, float *A, int lda) {
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n,
+                                   const float *const A[], int lda,
+                                   const int *P, float *const C[], int ldc,
+                                   int *info, int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasStpttr) < 0 ||
+  if (rpc_start_request(0, RPC_cublasSgetriBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &AP, sizeof(const float *)) < 0 ||
-      (AP != nullptr && rpc_write(0, AP, sizeof(const float)) < 0) ||
-      rpc_write(0, A, sizeof(float)) < 0 ||
-      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, A, sizeof(float)) < 0 ||
+      rpc_write(0, &A, sizeof(const float *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &P, sizeof(const int *)) < 0 ||
+      (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) ||
+      rpc_write(0, &C, sizeof(float *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-                            const double *AP, double *A, int lda) {
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n,
+                                   const double *const A[], int lda,
+                                   const int *P, double *const C[], int ldc,
+                                   int *info, int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasDtpttr) < 0 ||
+  if (rpc_start_request(0, RPC_cublasDgetriBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &AP, sizeof(const double *)) < 0 ||
-      (AP != nullptr && rpc_write(0, AP, sizeof(const double)) < 0) ||
-      rpc_write(0, A, sizeof(double)) < 0 ||
-      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, A, sizeof(double)) < 0 ||
+      rpc_write(0, &A, sizeof(const double *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &P, sizeof(const int *)) < 0 ||
+      (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) ||
+      rpc_write(0, &C, sizeof(double *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
      rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
-}
-
-cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-                            const cuComplex *AP, cuComplex *A, int lda) {
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+}
+
+cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n,
+                                   const cuComplex *const A[], int lda,
+                                   const int *P, cuComplex *const C[], int ldc,
+                                   int *info, int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasCtpttr) < 0 ||
+  if (rpc_start_request(0, RPC_cublasCgetriBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &AP, sizeof(const cuComplex *)) < 0 ||
-      (AP != nullptr && rpc_write(0, AP, sizeof(const cuComplex)) < 0) ||
-      rpc_write(0, A, sizeof(cuComplex)) < 0 ||
-      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, A, sizeof(cuComplex)) < 0 ||
+      rpc_write(0, &A, sizeof(const cuComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &P, sizeof(const int *)) < 0 ||
+      (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) ||
+      rpc_write(0, &C, sizeof(cuComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
-cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n,
-                            const cuDoubleComplex *AP, cuDoubleComplex *A,
-                            int lda) {
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
+cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n,
+                                   const cuDoubleComplex *const A[], int lda,
+                                   const int *P, cuDoubleComplex *const C[],
+                                   int ldc, int *info, int batchSize) {
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyHostToDevice) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZtpttr) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgetriBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &AP, sizeof(const cuDoubleComplex *)) < 0 ||
-      (AP != nullptr && rpc_write(0, AP, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_write(0, A, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_write(0, &lda, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &A, sizeof(const cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &lda, sizeof(int)) < 0 ||
+      rpc_write(0, &P, sizeof(const int *)) < 0 ||
+      (P != nullptr && rpc_write(0, P, sizeof(const int)) < 0) ||
+      rpc_write(0, &C, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldc, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
-    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)A); i++)
+    if (maybe_copy_unified_arg(0, (void *)A[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)P, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)C, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<int>(batchSize) && is_unified_pointer(0, (void *)C); i++)
+    if (maybe_copy_unified_arg(0, (void *)C[i], cudaMemcpyDeviceToHost) < 0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldc, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
 
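A server handler has to consume a request in exactly the byte order the client stub writes it. The sketch below mirrors the write order of the cublasSgetrsBatched stub that follows; it is illustrative only and not part of the patch. The handler name, the conn parameter, and the rpc_read declaration are assumptions standing in for the project's server plumbing.

    #include <cstddef>
    #include <cublas_v2.h>

    extern int rpc_read(const void *conn, void *data, std::size_t size);

    // Hypothetical server-side read sequence for an SgetrsBatched request.
    int handle_cublasSgetrsBatched_request(const void *conn) {
      int batchSize, n, nrhs, lda, ldb, info;
      cublasHandle_t handle;
      cublasOperation_t trans;
      const float *const *Aarray;
      const int *devIpiv;
      float *const *Barray;
      int devIpiv_payload; // single element shipped when devIpiv != nullptr
      if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
          rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
          rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
          rpc_read(conn, &n, sizeof(int)) < 0 ||
          rpc_read(conn, &nrhs, sizeof(int)) < 0 ||
          rpc_read(conn, &Aarray, sizeof(const float *const)) < 0 ||
          rpc_read(conn, &lda, sizeof(int)) < 0 ||
          rpc_read(conn, &devIpiv, sizeof(const int *)) < 0 ||
          (devIpiv != nullptr &&
           rpc_read(conn, &devIpiv_payload, sizeof(const int)) < 0) ||
          rpc_read(conn, &Barray, sizeof(float *const)) < 0 ||
          rpc_read(conn, &ldb, sizeof(int)) < 0 ||
          rpc_read(conn, &info, sizeof(int)) < 0)
        return -1;
      // ... invoke cublasSgetrsBatched and stream info back ...
      return 0;
    }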
rpc_write(0, &batchSize, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const float *)) < 0 || - (A != nullptr && rpc_write(0, A, sizeof(const float)) < 0) || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(const float *const)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, AP, sizeof(float)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, AP, sizeof(float)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int *)) < 0 || + (devIpiv != nullptr && rpc_write(0, devIpiv, sizeof(const int)) < 0) || + rpc_write(0, &Barray, sizeof(float *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, - const double *A, int lda, double *AP) { +cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, int n, int nrhs, + const double *const Aarray[], int lda, + const int *devIpiv, double *const Barray[], + int ldb, int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasDtrttp) < 0 || + if (rpc_start_request(0, RPC_cublasDgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const double *)) < 0 || - (A != nullptr && rpc_write(0, A, sizeof(const double)) < 0) || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(const double *const)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, AP, sizeof(double)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, AP, sizeof(double)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int *)) < 0 || + (devIpiv != nullptr && rpc_write(0, devIpiv, sizeof(const int)) < 0) || + rpc_write(0, &Barray, sizeof(double *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, 
cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, - const cuComplex *A, int lda, cuComplex *AP) { +cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, int n, int nrhs, + const cuComplex *const Aarray[], int lda, + const int *devIpiv, + cuComplex *const Barray[], int ldb, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; cublasStatus_t return_value; - if (rpc_start_request(0, RPC_cublasCtrttp) < 0 || + if (rpc_start_request(0, RPC_cublasCgetrsBatched) < 0 || + rpc_write(0, &batchSize, sizeof(int)) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 || + rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 || rpc_write(0, &n, sizeof(int)) < 0 || - rpc_write(0, &A, sizeof(const cuComplex *)) < 0 || - (A != nullptr && rpc_write(0, A, sizeof(const cuComplex)) < 0) || + rpc_write(0, &nrhs, sizeof(int)) < 0 || + rpc_write(0, &Aarray, sizeof(const cuComplex *const)) < 0 || rpc_write(0, &lda, sizeof(int)) < 0 || - rpc_write(0, AP, sizeof(cuComplex)) < 0 || rpc_wait_for_response(0) < 0 || - rpc_read(0, AP, sizeof(cuComplex)) < 0 || + rpc_write(0, &devIpiv, sizeof(const int *)) < 0 || + rpc_write(0, &Barray, sizeof(cuComplex *const)) < 0 || + rpc_write(0, &ldb, sizeof(int)) < 0 || + rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || + rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Aarray); + i++) + if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0) + if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + for (int i = 0; + i < static_cast(batchSize) && is_unified_pointer(0, (void *)Barray); + i++) + if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) < + 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0) + return CUBLAS_STATUS_NOT_INITIALIZED; + if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; return return_value; } -cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, - const cuDoubleComplex *A, int lda, - cuDoubleComplex *AP) { +cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, int n, int nrhs, + const cuDoubleComplex *const Aarray[], + int lda, const int *devIpiv, + cuDoubleComplex *const Barray[], int ldb, + int *info, int batchSize) { + if (maybe_copy_unified_arg(0, (void *)&batchSize, 
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<size_t>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyHostToDevice) < 0)
+  if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<size_t>(batchSize) && is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyHostToDevice) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyHostToDevice) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyHostToDevice) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   cublasStatus_t return_value;
-  if (rpc_start_request(0, RPC_cublasZtrttp) < 0 ||
+  if (rpc_start_request(0, RPC_cublasZgetrsBatched) < 0 ||
+      rpc_write(0, &batchSize, sizeof(int)) < 0 ||
       rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_write(0, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_write(0, &trans, sizeof(cublasOperation_t)) < 0 ||
       rpc_write(0, &n, sizeof(int)) < 0 ||
-      rpc_write(0, &A, sizeof(const cuDoubleComplex *)) < 0 ||
-      (A != nullptr && rpc_write(0, A, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_write(0, &nrhs, sizeof(int)) < 0 ||
+      rpc_write(0, &Aarray, sizeof(const cuDoubleComplex *const)) < 0 ||
       rpc_write(0, &lda, sizeof(int)) < 0 ||
-      rpc_write(0, AP, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_wait_for_response(0) < 0 ||
-      rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(0, &devIpiv, sizeof(const int *)) < 0 ||
+      (devIpiv != nullptr && rpc_write(0, devIpiv, sizeof(const int)) < 0) ||
+      rpc_write(0, &Barray, sizeof(cuDoubleComplex *const)) < 0 ||
+      rpc_write(0, &ldb, sizeof(int)) < 0 ||
+      rpc_write(0, info, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 ||
+      rpc_read(0, info, sizeof(int)) < 0 ||
       rpc_end_response(0, &return_value) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&batchSize, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&handle, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)&uplo, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&trans, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&n, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)A, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)&nrhs, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Aarray, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<size_t>(batchSize) && is_unified_pointer(0, (void *)Aarray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Aarray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
   if (maybe_copy_unified_arg(0, (void *)&lda, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
-  if (maybe_copy_unified_arg(0, (void *)AP, cudaMemcpyDeviceToHost) < 0)
+  if (maybe_copy_unified_arg(0, (void *)devIpiv, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)Barray, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  for (int i = 0;
+       i < static_cast<size_t>(batchSize) && is_unified_pointer(0, (void *)Barray);
+       i++)
+    if (maybe_copy_unified_arg(0, (void *)Barray[i], cudaMemcpyDeviceToHost) <
+        0)
+      return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)&ldb, cudaMemcpyDeviceToHost) < 0)
+    return CUBLAS_STATUS_NOT_INITIALIZED;
+  if (maybe_copy_unified_arg(0, (void *)info, cudaMemcpyDeviceToHost) < 0)
     return CUBLAS_STATUS_NOT_INITIALIZED;
   return return_value;
 }
@@ -49363,6 +53736,8 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasChpr2_v2_64", (void *)cublasChpr2_v2_64},
     {"cublasZhpr2_v2", (void *)cublasZhpr2_v2},
     {"cublasZhpr2_v2_64", (void *)cublasZhpr2_v2_64},
+    {"cublasSgemvBatched", (void *)cublasSgemvBatched},
+    {"cublasTSTgemvBatched", (void *)cublasTSTgemvBatched},
     {"cublasSgemvStridedBatched", (void *)cublasSgemvStridedBatched},
     {"cublasSgemvStridedBatched_64", (void *)cublasSgemvStridedBatched_64},
     {"cublasDgemvStridedBatched", (void *)cublasDgemvStridedBatched},
@@ -49457,6 +53832,18 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCtrmm_v2_64", (void *)cublasCtrmm_v2_64},
     {"cublasZtrmm_v2", (void *)cublasZtrmm_v2},
     {"cublasZtrmm_v2_64", (void *)cublasZtrmm_v2_64},
+    {"cublasHgemmBatched", (void *)cublasHgemmBatched},
+    {"cublasHgemmBatched_64", (void *)cublasHgemmBatched_64},
+    {"cublasSgemmBatched", (void *)cublasSgemmBatched},
+    {"cublasSgemmBatched_64", (void *)cublasSgemmBatched_64},
+    {"cublasDgemmBatched", (void *)cublasDgemmBatched},
+    {"cublasDgemmBatched_64", (void *)cublasDgemmBatched_64},
+    {"cublasCgemmBatched", (void *)cublasCgemmBatched},
+    {"cublasCgemmBatched_64", (void *)cublasCgemmBatched_64},
+    {"cublasCgemm3mBatched", (void *)cublasCgemm3mBatched},
+    {"cublasCgemm3mBatched_64", (void *)cublasCgemm3mBatched_64},
+    {"cublasZgemmBatched", (void *)cublasZgemmBatched},
+    {"cublasZgemmBatched_64", (void *)cublasZgemmBatched_64},
     {"cublasHgemmStridedBatched", (void *)cublasHgemmStridedBatched},
     {"cublasHgemmStridedBatched_64", (void *)cublasHgemmStridedBatched_64},
     {"cublasSgemmStridedBatched", (void *)cublasSgemmStridedBatched},
@@ -49469,6 +53856,7 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCgemm3mStridedBatched_64", (void *)cublasCgemm3mStridedBatched_64},
     {"cublasZgemmStridedBatched", (void *)cublasZgemmStridedBatched},
     {"cublasZgemmStridedBatched_64", (void *)cublasZgemmStridedBatched_64},
+    {"cublasGemmBatchedEx_64", (void *)cublasGemmBatchedEx_64},
     {"cublasSgeam", (void *)cublasSgeam},
     {"cublasSgeam_64", (void *)cublasSgeam_64},
     {"cublasDgeam", (void *)cublasDgeam},
@@ -49477,6 +53865,14 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCgeam_64", (void *)cublasCgeam_64},
     {"cublasZgeam", (void *)cublasZgeam},
     {"cublasZgeam_64", (void *)cublasZgeam_64},
+    {"cublasStrsmBatched", (void *)cublasStrsmBatched},
+    {"cublasStrsmBatched_64", (void *)cublasStrsmBatched_64},
+    {"cublasDtrsmBatched", (void *)cublasDtrsmBatched},
+    {"cublasDtrsmBatched_64", (void *)cublasDtrsmBatched_64},
+    {"cublasCtrsmBatched", (void *)cublasCtrsmBatched},
+    {"cublasCtrsmBatched_64", (void *)cublasCtrsmBatched_64},
+    {"cublasZtrsmBatched", (void *)cublasZtrsmBatched},
+    {"cublasZtrsmBatched_64", (void *)cublasZtrsmBatched_64},
     {"cublasSdgmm", (void *)cublasSdgmm},
     {"cublasSdgmm_64", (void *)cublasSdgmm_64},
     {"cublasDdgmm", (void *)cublasDdgmm},
@@ -49485,6 +53881,18 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasCdgmm_64", (void *)cublasCdgmm_64},
     {"cublasZdgmm", (void *)cublasZdgmm},
     {"cublasZdgmm_64", (void *)cublasZdgmm_64},
+    {"cublasSmatinvBatched", (void *)cublasSmatinvBatched},
+    {"cublasDmatinvBatched", (void *)cublasDmatinvBatched},
+    {"cublasCmatinvBatched", (void *)cublasCmatinvBatched},
+    {"cublasZmatinvBatched", (void *)cublasZmatinvBatched},
+    {"cublasSgeqrfBatched", (void *)cublasSgeqrfBatched},
+    {"cublasDgeqrfBatched", (void *)cublasDgeqrfBatched},
+    {"cublasCgeqrfBatched", (void *)cublasCgeqrfBatched},
+    {"cublasZgeqrfBatched", (void *)cublasZgeqrfBatched},
+    {"cublasSgelsBatched", (void *)cublasSgelsBatched},
+    {"cublasDgelsBatched", (void *)cublasDgelsBatched},
+    {"cublasCgelsBatched", (void *)cublasCgelsBatched},
+    {"cublasZgelsBatched", (void *)cublasZgelsBatched},
     {"cublasStpttr", (void *)cublasStpttr},
     {"cublasDtpttr", (void *)cublasDtpttr},
     {"cublasCtpttr", (void *)cublasCtpttr},
@@ -49493,6 +53901,14 @@ std::unordered_map<std::string, void *> functionMap = {
     {"cublasDtrttp", (void *)cublasDtrttp},
     {"cublasCtrttp", (void *)cublasCtrttp},
     {"cublasZtrttp", (void *)cublasZtrttp},
+    {"cublasSgetriBatched", (void *)cublasSgetriBatched},
+    {"cublasDgetriBatched", (void *)cublasDgetriBatched},
+    {"cublasCgetriBatched", (void *)cublasCgetriBatched},
+    {"cublasZgetriBatched", (void *)cublasZgetriBatched},
+    {"cublasSgetrsBatched", (void *)cublasSgetrsBatched},
+    {"cublasDgetrsBatched", (void *)cublasDgetrsBatched},
+    {"cublasCgetrsBatched", (void *)cublasCgetrsBatched},
+    {"cublasZgetrsBatched", (void *)cublasZgetrsBatched},
     {"cublasUint8gemmBias", (void *)cublasUint8gemmBias},
     {"cudnnGetProperty", (void *)cudnnGetProperty},
     {"cudnnCreate", (void *)cudnnCreate},
diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp
index 6d24ab5..d42ef85 100644
--- a/codegen/gen_server.cpp
+++ b/codegen/gen_server.cpp
@@ -31601,6 +31601,100 @@ int handle_cublasZhpr2_v2_64(void *conn) {
   return -1;
 }
 
+int handle_cublasSgemvBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasOperation_t trans;
+  int m;
+  int n;
+  const float *alpha;
+  const float **Aarray = nullptr;
+  int lda;
+  const float **xarray = nullptr;
+  int incx;
+  const float *beta;
+  float **yarray = nullptr;
+  int incy;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &xarray, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &yarray, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &incy, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasSgemvBatched(handle, trans, m, n, alpha, Aarray, lda, xarray, incx,
+                         beta, yarray, incy, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasTSTgemvBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasOperation_t trans;
+  int m;
+  int n;
+  const float *alpha;
+  const __nv_bfloat16 **Aarray = nullptr;
+  int lda;
+  const __nv_bfloat16 **xarray = nullptr;
+  int incx;
+  const float *beta;
+  __nv_bfloat16 **yarray = nullptr;
+  int incy;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const __nv_bfloat16 *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &xarray, sizeof(const __nv_bfloat16 *const *)) < 0 ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &yarray, sizeof(__nv_bfloat16 *const *)) < 0 ||
+      rpc_read(conn, &incy, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasTSTgemvBatched(handle, trans, m, n, alpha, Aarray, lda, xarray,
+                           incx, beta, yarray, incy, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
 int handle_cublasSgemvStridedBatched(void *conn) {
   cublasHandle_t handle;
   cublasOperation_t trans;
@@ -36114,60 +36208,49 @@ int handle_cublasZtrmm_v2_64(void *conn) {
   return -1;
 }
 
-int handle_cublasHgemmStridedBatched(void *conn) {
+int handle_cublasHgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int m;
   int n;
   int k;
-  __half *alpha_null_check;
-  __half alpha;
-  const __half *A;
+  const __half *alpha;
+  const __half **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const __half *B;
+  const __half **Barray = nullptr;
   int ldb;
-  long long int strideB;
-  __half *beta_null_check;
-  __half beta;
-  __half C;
+  const __half *beta;
+  __half **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
       rpc_read(conn, &k, sizeof(int)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const __half *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) ||
-      rpc_read(conn, &A, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const __half *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const __half *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const __half *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) ||
-      rpc_read(conn, &C, sizeof(__half)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &beta, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Carray, sizeof(__half *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasHgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasHgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                         Barray, ldb, beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(__half)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36176,60 +36259,49 @@ int handle_cublasHgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasHgemmStridedBatched_64(void *conn) {
+int handle_cublasHgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int64_t m;
   int64_t n;
   int64_t k;
-  __half *alpha_null_check;
-  __half alpha;
-  const __half *A;
+  const __half *alpha;
+  const __half **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const __half *B;
+  const __half **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
-  __half *beta_null_check;
-  __half beta;
-  __half C;
+  const __half *beta;
+  __half **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const __half *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) ||
-      rpc_read(conn, &A, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const __half *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const __half *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const __half *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) ||
-      rpc_read(conn, &C, sizeof(__half)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &beta, sizeof(const __half *)) < 0 ||
+      rpc_read(conn, &Carray, sizeof(__half *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasHgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasHgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                            Barray, ldb, beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(__half)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36238,60 +36310,49 @@ int handle_cublasHgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasSgemmStridedBatched(void *conn) {
+int handle_cublasSgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int m;
   int n;
   int k;
-  float *alpha_null_check;
-  float alpha;
-  const float *A;
+  const float *alpha;
+  const float **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const float *B;
+  const float **Barray = nullptr;
   int ldb;
-  long long int strideB;
-  float *beta_null_check;
-  float beta;
-  float C;
+  const float *beta;
+  float **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
       rpc_read(conn, &k, sizeof(int)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const float *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const float *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &beta, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Carray, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
    goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasSgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                         Barray, ldb, beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36300,60 +36361,49 @@ int handle_cublasSgemmStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasSgemmStridedBatched_64(void *conn) {
+int handle_cublasSgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int64_t m;
   int64_t n;
   int64_t k;
-  float *alpha_null_check;
-  float alpha;
-  const float *A;
+  const float *alpha;
+  const float **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const float *B;
+  const float **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
-  float *beta_null_check;
-  float beta;
-  float C;
+  const float *beta;
+  float **Carray = nullptr;
  int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &alpha, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const float *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const float *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &beta, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Carray, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasSgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasSgemmBatched_64(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                            Barray, ldb, beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36362,7 +36412,8 @@ int handle_cublasSgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasDgemmStridedBatched(void *conn) {
+int handle_cublasDgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36371,21 +36422,18 @@ int handle_cublasDgemmStridedBatched(void *conn) {
   int k;
   double *alpha_null_check;
   double alpha;
-  const double *A;
+  const double **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const double *B;
+  const double **Barray = nullptr;
   int ldb;
-  long long int strideB;
   double *beta_null_check;
   double beta;
-  double C;
+  double **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
@@ -36393,29 +36441,24 @@ int handle_cublasDgemmStridedBatched(void *conn) {
       rpc_read(conn, &k, sizeof(int)) < 0 ||
       rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
-      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const double *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const double *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) ||
-      rpc_read(conn, &C, sizeof(double)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasDgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasDgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda,
+                         Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(double)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36424,7 +36467,8 @@ int handle_cublasDgemmStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasDgemmStridedBatched_64(void *conn) {
+int handle_cublasDgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36433,21 +36477,18 @@ int handle_cublasDgemmStridedBatched_64(void *conn) {
   int64_t k;
   double *alpha_null_check;
   double alpha;
-  const double *A;
+  const double **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const double *B;
+  const double **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
   double *beta_null_check;
   double beta;
-  double C;
+  double **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
@@ -36455,29 +36496,24 @@ int handle_cublasDgemmStridedBatched_64(void *conn) {
       rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
-      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const double *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const double *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) ||
-      rpc_read(conn, &C, sizeof(double)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasDgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasDgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray,
+                            lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(double)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36486,7 +36522,8 @@ int handle_cublasDgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasCgemmStridedBatched(void *conn) {
+int handle_cublasCgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36495,21 +36532,18 @@ int handle_cublasCgemmStridedBatched(void *conn) {
   int k;
   cuComplex *alpha_null_check;
   cuComplex alpha;
-  const cuComplex *A;
+  const cuComplex **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const cuComplex *B;
+  const cuComplex **Barray = nullptr;
   int ldb;
-  long long int strideB;
   cuComplex *beta_null_check;
   cuComplex beta;
-  cuComplex C;
+  cuComplex **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
@@ -36518,29 +36552,24 @@ int handle_cublasCgemmStridedBatched(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasCgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda,
+                         Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36549,7 +36578,8 @@ int handle_cublasCgemmStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasCgemmStridedBatched_64(void *conn) {
+int handle_cublasCgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36558,21 +36588,18 @@ int handle_cublasCgemmStridedBatched_64(void *conn) {
   int64_t k;
   cuComplex *alpha_null_check;
   cuComplex alpha;
-  const cuComplex *A;
+  const cuComplex **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const cuComplex *B;
+  const cuComplex **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
   cuComplex *beta_null_check;
   cuComplex beta;
-  cuComplex C;
+  cuComplex **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
@@ -36581,29 +36608,24 @@ int handle_cublasCgemmStridedBatched_64(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasCgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray,
+                            lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36612,7 +36634,8 @@ int handle_cublasCgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasCgemm3mStridedBatched(void *conn) {
+int handle_cublasCgemm3mBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36621,21 +36644,18 @@ int handle_cublasCgemm3mStridedBatched(void *conn) {
   int k;
   cuComplex *alpha_null_check;
   cuComplex alpha;
-  const cuComplex *A;
+  const cuComplex **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const cuComplex *B;
+  const cuComplex **Barray = nullptr;
   int ldb;
-  long long int strideB;
   cuComplex *beta_null_check;
   cuComplex beta;
-  cuComplex C;
+  cuComplex **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
@@ -36644,29 +36664,24 @@ int handle_cublasCgemm3mStridedBatched(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgemm3mStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasCgemm3mBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda,
+                           Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36675,7 +36690,8 @@ int handle_cublasCgemm3mStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasCgemm3mStridedBatched_64(void *conn) {
+int handle_cublasCgemm3mBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36684,21 +36700,18 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) {
   int64_t k;
   cuComplex *alpha_null_check;
   cuComplex alpha;
-  const cuComplex *A;
+  const cuComplex **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const cuComplex *B;
+  const cuComplex **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
   cuComplex *beta_null_check;
   cuComplex beta;
-  cuComplex C;
+  cuComplex **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
@@ -36707,29 +36720,24 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgemm3mStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasCgemm3mBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray,
+                              lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36738,7 +36746,8 @@ int handle_cublasCgemm3mStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasZgemmStridedBatched(void *conn) {
+int handle_cublasZgemmBatched(void *conn) {
+  int batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36747,21 +36756,18 @@ int handle_cublasZgemmStridedBatched(void *conn) {
   int k;
   cuDoubleComplex *alpha_null_check;
   cuDoubleComplex alpha;
-  const cuDoubleComplex *A;
+  const cuDoubleComplex **Aarray = nullptr;
   int lda;
-  long long int strideA;
-  const cuDoubleComplex *B;
+  const cuDoubleComplex **Barray = nullptr;
   int ldb;
-  long long int strideB;
   cuDoubleComplex *beta_null_check;
   cuDoubleComplex beta;
-  cuDoubleComplex C;
+  cuDoubleComplex **Carray = nullptr;
   int ldc;
-  long long int strideC;
-  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
@@ -36770,30 +36776,25 @@ int handle_cublasZgemmStridedBatched(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuDoubleComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasZgemmStridedBatched(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasZgemmBatched(handle, transa, transb, m, n, k, &alpha, Aarray, lda,
+                         Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36802,7 +36803,8 @@ int handle_cublasZgemmStridedBatched(void *conn) {
   return -1;
 }
 
-int handle_cublasZgemmStridedBatched_64(void *conn) {
+int handle_cublasZgemmBatched_64(void *conn) {
+  int64_t batchCount;
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
@@ -36811,21 +36813,18 @@ int handle_cublasZgemmStridedBatched_64(void *conn) {
   int64_t k;
   cuDoubleComplex *alpha_null_check;
   cuDoubleComplex alpha;
-  const cuDoubleComplex *A;
+  const cuDoubleComplex **Aarray = nullptr;
   int64_t lda;
-  long long int strideA;
-  const cuDoubleComplex *B;
+  const cuDoubleComplex **Barray = nullptr;
   int64_t ldb;
-  long long int strideB;
   cuDoubleComplex *beta_null_check;
   cuDoubleComplex beta;
-  cuDoubleComplex C;
+  cuDoubleComplex **Carray = nullptr;
   int64_t ldc;
-  long long int strideC;
-  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
@@ -36834,30 +36833,25 @@ int handle_cublasZgemmStridedBatched_64(void *conn) {
       rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
       (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &Barray, sizeof(const cuDoubleComplex *const *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
       rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
       (beta_null_check && rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
-      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &Carray, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasZgemmStridedBatched_64(
-      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
-      &beta, &C, ldc, strideC, batchCount);
+  scuda_intercept_result =
+      cublasZgemmBatched_64(handle, transa, transb, m, n, k, &alpha, Aarray,
+                            lda, Barray, ldb, &beta, Carray, ldc, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36866,22 +36860,27 @@ int handle_cublasZgemmStridedBatched_64(void *conn) {
   return -1;
 }
 
-int handle_cublasSgeam(void *conn) {
+int handle_cublasHgemmStridedBatched(void *conn) {
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int m;
   int n;
-  float *alpha_null_check;
-  float alpha;
-  const float *A;
+  int k;
+  __half *alpha_null_check;
+  __half alpha;
+  const __half *A;
   int lda;
-  float *beta_null_check;
-  float beta;
-  const float *B;
+  long long int strideA;
+  const __half *B;
   int ldb;
-  float C;
+  long long int strideB;
+  __half *beta_null_check;
+  __half beta;
+  __half C;
   int ldc;
+  long long int strideC;
+  int batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
   if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -36889,26 +36888,32 @@ int handle_cublasSgeam(void *conn) {
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &k, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const __half *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) ||
+      rpc_read(conn, &A, sizeof(const __half *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
-      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &B, sizeof(const __half *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const __half *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) ||
+      rpc_read(conn, &C, sizeof(__half)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, &alpha, A,
-                                       lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result = cublasHgemmStridedBatched(
+      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
+      &beta, &C, ldc, strideC, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_write(conn, &C, sizeof(__half)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
 
@@ -36917,22 +36922,27 @@ int handle_cublasSgeam(void *conn) {
   return -1;
 }
 
-int handle_cublasSgeam_64(void *conn) {
+int handle_cublasHgemmStridedBatched_64(void *conn) {
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int64_t m;
   int64_t n;
-  float *alpha_null_check;
-  float alpha;
-  const float *A;
+  int64_t k;
+  __half *alpha_null_check;
+  __half alpha;
+  const __half *A;
   int64_t lda;
-  float *beta_null_check;
-  float beta;
-  const float *B;
+  long long int strideA;
+  const __half *B;
   int64_t ldb;
-  float C;
+  long long int strideB;
+  __half *beta_null_check;
+  __half beta;
+  __half C;
   int64_t ldc;
+  long long int strideC;
+  int64_t batchCount;
   int request_id;
   cublasStatus_t scuda_intercept_result;
   if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
@@ -36940,26 +36950,32 @@ int handle_cublasSgeam_64(void *conn) {
       rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
       rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
-      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const __half *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const __half)) < 0) ||
+      rpc_read(conn, &A, sizeof(const __half *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
-      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
-      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &B, sizeof(const __half *)) < 0 ||
       rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const __half *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const __half)) < 0) ||
+      rpc_read(conn, &C, sizeof(__half)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, &alpha,
-                                          A, lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result = cublasHgemmStridedBatched_64(
+      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
+      &beta, &C, ldc, strideC, batchCount);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_write(conn, &C, sizeof(__half)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -36968,22 +36984,151 @@ int handle_cublasSgeam_64(void *conn) {
   return -1;
 }
 
-int handle_cublasDgeam(void *conn) {
+int handle_cublasSgemmStridedBatched(void *conn) {
   cublasHandle_t handle;
   cublasOperation_t transa;
   cublasOperation_t transb;
   int m;
   int n;
-  double *alpha_null_check;
-  double alpha;
-  const double *A;
-  int lda;
-  double *beta_null_check;
-  double beta;
+  int k;
+  float *alpha_null_check;
+  float alpha;
+  const float *A;
+  int lda;
+  long long int strideA;
+  const float *B;
+  int ldb;
+  long long int strideB;
+  float *beta_null_check;
+  float beta;
+  float C;
+  int ldc;
+  long long int strideC;
+  int batchCount;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &k, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 ||
+      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &batchCount, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasSgemmStridedBatched(
+      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
+      &beta, &C, ldc, strideC, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSgemmStridedBatched_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  int64_t k;
+  float *alpha_null_check;
+  float alpha;
+  const float *A;
+  int64_t lda;
+  long long int strideA;
+  const float *B;
+  int64_t ldb;
+  long long int strideB;
+  float *beta_null_check;
+  float beta;
+  float C;
+  int64_t ldc;
+  long long int strideC;
+  int64_t batchCount;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &k, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &strideA, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &strideB, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &strideC, sizeof(long long int)) < 0 ||
+      rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasSgemmStridedBatched_64(
+      handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB,
+      &beta, &C, ldc, strideC, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDgemmStridedBatched(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int m;
+  int n;
+  int k;
+  double *alpha_null_check;
+  double alpha;
+  const double *A;
+  int lda;
+  long long int strideA;
   const double *B;
   int ldb;
+  long long int strideB;
+  double *beta_null_check;
+  double beta;
   double C;
   int ldc;
+  long long int strideC;
+  int batchCount;
   int request_id;
scuda_intercept_result; if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || @@ -37042,23 +37198,29 @@ int handle_cublasDgeam_64(void *conn) { rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 || (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) || rpc_read(conn, &A, sizeof(const double *)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 || - (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const double *)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) || rpc_read(conn, &C, sizeof(double)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, &alpha, - A, lda, &beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasDgemmStridedBatched_64( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(double)) < 0 || @@ -37070,22 +37232,27 @@ int handle_cublasDgeam_64(void *conn) { return -1; } -int handle_cublasCgeam(void *conn) { +int handle_cublasCgemmStridedBatched(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int m; int n; + int k; cuComplex *alpha_null_check; cuComplex alpha; const cuComplex *A; int lda; - cuComplex *beta_null_check; - cuComplex beta; + long long int strideA; const cuComplex *B; int ldb; + long long int strideB; + cuComplex *beta_null_check; + cuComplex beta; cuComplex C; int ldc; + long long int strideC; + int batchCount; int request_id; cublasStatus_t scuda_intercept_result; if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || @@ -37093,24 +37260,30 @@ int handle_cublasCgeam(void *conn) { rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 || (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || - (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 || rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || 
false) + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, &alpha, A, - lda, &beta, B, ldb, &C, ldc); + scuda_intercept_result = cublasCgemmStridedBatched( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &C, sizeof(cuComplex)) < 0 || @@ -37122,22 +37295,27 @@ int handle_cublasCgeam(void *conn) { return -1; } -int handle_cublasCgeam_64(void *conn) { +int handle_cublasCgemmStridedBatched_64(void *conn) { cublasHandle_t handle; cublasOperation_t transa; cublasOperation_t transb; int64_t m; int64_t n; + int64_t k; cuComplex *alpha_null_check; cuComplex alpha; const cuComplex *A; int64_t lda; - cuComplex *beta_null_check; - cuComplex beta; + long long int strideA; const cuComplex *B; int64_t ldb; + long long int strideB; + cuComplex *beta_null_check; + cuComplex beta; cuComplex C; int64_t ldc; + long long int strideC; + int64_t batchCount; int request_id; cublasStatus_t scuda_intercept_result; if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || @@ -37145,27 +37323,1580 @@ int handle_cublasCgeam_64(void *conn) { rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int64_t)) < 0 || rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 || (alpha_null_check && rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || - (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 || rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgemmStridedBatched_64( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgemm3mStridedBatched(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + int k; + cuComplex *alpha_null_check; + cuComplex alpha; + const cuComplex *A; + int lda; + long long int strideA; + const cuComplex *B; + int ldb; + long long int strideB; + cuComplex *beta_null_check; + cuComplex beta; + cuComplex C; + int ldc; + long long int strideC; + int batchCount; + int 
request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 || + (alpha_null_check && + rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgemm3mStridedBatched( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgemm3mStridedBatched_64(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + cuComplex *alpha_null_check; + cuComplex alpha; + const cuComplex *A; + int64_t lda; + long long int strideA; + const cuComplex *B; + int64_t ldb; + long long int strideB; + cuComplex *beta_null_check; + cuComplex beta; + cuComplex C; + int64_t ldc; + long long int strideC; + int64_t batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 || + (alpha_null_check && + rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasCgemm3mStridedBatched_64( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, 
strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgemmStridedBatched(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int m; + int n; + int k; + cuDoubleComplex *alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex *A; + int lda; + long long int strideA; + const cuDoubleComplex *B; + int ldb; + long long int strideB; + cuDoubleComplex *beta_null_check; + cuDoubleComplex beta; + cuDoubleComplex C; + int ldc; + long long int strideC; + int batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &k, sizeof(int)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 || + (alpha_null_check && + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 || + (beta_null_check && + rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgemmStridedBatched( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgemmStridedBatched_64(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + cuDoubleComplex *alpha_null_check; + cuDoubleComplex alpha; + const cuDoubleComplex *A; + int64_t lda; + long long int strideA; + const cuDoubleComplex *B; + int64_t ldb; + long long int strideB; + cuDoubleComplex *beta_null_check; + cuDoubleComplex beta; + cuDoubleComplex C; + int64_t ldc; + long long int strideC; + int64_t batchCount; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 || + (alpha_null_check && + rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &A, sizeof(const cuDoubleComplex 
*)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideA, sizeof(long long int)) < 0 || + rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideB, sizeof(long long int)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 || + (beta_null_check && + rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) || + rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &strideC, sizeof(long long int)) < 0 || + rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasZgemmStridedBatched_64( + handle, transa, transb, m, n, k, &alpha, A, lda, strideA, B, ldb, strideB, + &beta, &C, ldc, strideC, batchCount); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasGemmBatchedEx_64(void *conn) { + int64_t batchCount; + cublasHandle_t handle; + cublasOperation_t transa; + cublasOperation_t transb; + int64_t m; + int64_t n; + int64_t k; + void *alpha_null_check; + void *alpha; + const void **Aarray = nullptr; + cudaDataType Atype; + int64_t lda; + const void **Barray = nullptr; + cudaDataType Btype; + int64_t ldb; + void *beta_null_check; + void *beta; + void **Carray = nullptr; + cudaDataType Ctype; + int64_t ldc; + cublasComputeType_t computeType; + cublasGemmAlgo_t algo; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int64_t)) < 0 || + rpc_read(conn, &n, sizeof(int64_t)) < 0 || + rpc_read(conn, &k, sizeof(int64_t)) < 0 || + rpc_read(conn, &alpha_null_check, sizeof(const void *)) < 0 || + (alpha_null_check && rpc_read(conn, &alpha, sizeof(const void *)) < 0) || + rpc_read(conn, &Aarray, sizeof(const void *const *)) < 0 || + rpc_read(conn, &Atype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &lda, sizeof(int64_t)) < 0 || + rpc_read(conn, &Barray, sizeof(const void *const *)) < 0 || + rpc_read(conn, &Btype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || + rpc_read(conn, &beta_null_check, sizeof(const void *)) < 0 || + (beta_null_check && rpc_read(conn, &beta, sizeof(const void *)) < 0) || + rpc_read(conn, &Carray, sizeof(void *const *)) < 0 || + rpc_read(conn, &Ctype, sizeof(cudaDataType)) < 0 || + rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || + rpc_read(conn, &computeType, sizeof(cublasComputeType_t)) < 0 || + rpc_read(conn, &algo, sizeof(cublasGemmAlgo_t)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cublasGemmBatchedEx_64( + handle, transa, transb, m, n, k, &alpha, Aarray, Atype, lda, Barray, + Btype, ldb, &beta, Carray, Ctype, ldc, batchCount, computeType, algo); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgeam(void *conn) { + cublasHandle_t handle; + cublasOperation_t transa; + 
+  cublasOperation_t transb;
+  int m;
+  int n;
+  float *alpha_null_check;
+  float alpha;
+  const float *A;
+  int lda;
+  float *beta_null_check;
+  float beta;
+  const float *B;
+  int ldb;
+  float C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
+      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasSgeam(handle, transa, transb, m, n, &alpha, A,
+                                       lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSgeam_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  float *alpha_null_check;
+  float alpha;
+  const float *A;
+  int64_t lda;
+  float *beta_null_check;
+  float beta;
+  const float *B;
+  int64_t ldb;
+  float C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const float *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const float)) < 0) ||
+      rpc_read(conn, &B, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasSgeam_64(handle, transa, transb, m, n, &alpha,
+                                          A, lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDgeam(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int m;
+  int n;
+  double *alpha_null_check;
+  double alpha;
+  const double *A;
+  int lda;
+  double *beta_null_check;
+  double beta;
+  const double *B;
+  int ldb;
+  double C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
+      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) ||
+      rpc_read(conn, &B, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(double)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasDgeam(handle, transa, transb, m, n, &alpha, A,
+                                       lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDgeam_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  double *alpha_null_check;
+  double alpha;
+  const double *A;
+  int64_t lda;
+  double *beta_null_check;
+  double beta;
+  const double *B;
+  int64_t ldb;
+  double C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
+      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const double *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const double)) < 0) ||
+      rpc_read(conn, &B, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(double)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasDgeam_64(handle, transa, transb, m, n, &alpha,
+                                          A, lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCgeam(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int m;
+  int n;
+  cuComplex *alpha_null_check;
+  cuComplex alpha;
+  const cuComplex *A;
+  int lda;
+  cuComplex *beta_null_check;
+  cuComplex beta;
+  const cuComplex *B;
+  int ldb;
+  cuComplex C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasCgeam(handle, transa, transb, m, n, &alpha, A,
+                                       lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCgeam_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  cuComplex *alpha_null_check;
+  cuComplex alpha;
+  const cuComplex *A;
+  int64_t lda;
+  cuComplex *beta_null_check;
+  cuComplex beta;
+  const cuComplex *B;
+  int64_t ldb;
+  cuComplex C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const cuComplex *)) < 0 ||
+      (beta_null_check && rpc_read(conn, &beta, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &B, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, &alpha,
+                                          A, lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZgeam(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int m;
+  int n;
+  cuDoubleComplex *alpha_null_check;
+  cuDoubleComplex alpha;
+  const cuDoubleComplex *A;
+  int lda;
+  cuDoubleComplex *beta_null_check;
+  cuDoubleComplex beta;
+  const cuDoubleComplex *B;
+  int ldb;
+  cuDoubleComplex C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (beta_null_check &&
+       rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, &alpha, A,
+                                       lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZgeam_64(void *conn) {
+  cublasHandle_t handle;
+  cublasOperation_t transa;
+  cublasOperation_t transb;
+  int64_t m;
+  int64_t n;
+  cuDoubleComplex *alpha_null_check;
+  cuDoubleComplex alpha;
+  const cuDoubleComplex *A;
+  int64_t lda;
+  cuDoubleComplex *beta_null_check;
+  cuDoubleComplex beta;
+  const cuDoubleComplex *B;
+  int64_t ldb;
+  cuDoubleComplex C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (beta_null_check &&
+       rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, &alpha,
+                                          A, lda, &beta, B, ldb, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasStrsmBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int m;
+  int n;
+  float *alpha_null_check;
+  float alpha;
+  const float **A = nullptr;
+  int lda;
+  float **B = nullptr;
+  int ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &B, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasStrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda,
+                         B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasStrsmBatched_64(void *conn) {
+  int64_t batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int64_t m;
+  int64_t n;
+  float *alpha_null_check;
+  float alpha;
+  const float **A = nullptr;
+  int64_t lda;
+  float **B = nullptr;
+  int64_t ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const float *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const float)) < 0) ||
+      rpc_read(conn, &A, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &B, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasStrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A,
+                            lda, B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDtrsmBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int m;
+  int n;
+  double *alpha_null_check;
+  double alpha;
+  const double **A = nullptr;
+  int lda;
+  double **B = nullptr;
+  int ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
+      rpc_read(conn, &A, sizeof(const double *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &B, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasDtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda,
+                         B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDtrsmBatched_64(void *conn) {
+  int64_t batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int64_t m;
+  int64_t n;
+  double *alpha_null_check;
+  double alpha;
+  const double **A = nullptr;
+  int64_t lda;
+  double **B = nullptr;
+  int64_t ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const double *)) < 0 ||
+      (alpha_null_check && rpc_read(conn, &alpha, sizeof(const double)) < 0) ||
+      rpc_read(conn, &A, sizeof(const double *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &B, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasDtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A,
+                            lda, B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCtrsmBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int m;
+  int n;
+  cuComplex *alpha_null_check;
+  cuComplex alpha;
+  const cuComplex **A = nullptr;
+  int lda;
+  cuComplex **B = nullptr;
+  int ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &B, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasCtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda,
+                         B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCtrsmBatched_64(void *conn) {
+  int64_t batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int64_t m;
+  int64_t n;
+  cuComplex *alpha_null_check;
+  cuComplex alpha;
+  const cuComplex **A = nullptr;
+  int64_t lda;
+  cuComplex **B = nullptr;
+  int64_t ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &B, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasCtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A,
+                            lda, B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZtrsmBatched(void *conn) {
+  int batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int m;
+  int n;
+  cuDoubleComplex *alpha_null_check;
+  cuDoubleComplex alpha;
+  const cuDoubleComplex **A = nullptr;
+  int lda;
+  cuDoubleComplex **B = nullptr;
+  int ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &B, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasZtrsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A, lda,
+                         B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZtrsmBatched_64(void *conn) {
+  int64_t batchCount;
+  cublasHandle_t handle;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasOperation_t trans;
+  cublasDiagType_t diag;
+  int64_t m;
+  int64_t n;
+  cuDoubleComplex *alpha_null_check;
+  cuDoubleComplex alpha;
+  const cuDoubleComplex **A = nullptr;
+  int64_t lda;
+  cuDoubleComplex **B = nullptr;
+  int64_t ldb;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchCount, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &side, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &uplo, sizeof(cublasFillMode_t)) < 0 ||
+      rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 ||
+      rpc_read(conn, &diag, sizeof(cublasDiagType_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (alpha_null_check &&
+       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &B, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasZtrsmBatched_64(handle, side, uplo, trans, diag, m, n, &alpha, A,
+                            lda, B, ldb, batchCount);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSdgmm(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int m;
+  int n;
+  const float *A;
+  int lda;
+  float *x_null_check;
+  float x;
+  int incx;
+  float C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const float *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasSdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSdgmm_64(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int64_t m;
+  int64_t n;
+  const float *A;
+  int64_t lda;
+  float *x_null_check;
+  float x;
+  int64_t incx;
+  float C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const float *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(float)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasSdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDdgmm(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int m;
+  int n;
+  const double *A;
+  int lda;
+  double *x_null_check;
+  double x;
+  int incx;
+  double C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const double *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(double)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasDdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDdgmm_64(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int64_t m;
+  int64_t n;
+  const double *A;
+  int64_t lda;
+  double *x_null_check;
+  double x;
+  int64_t incx;
+  double C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const double *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(double)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasDdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCdgmm(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int m;
+  int n;
+  const cuComplex *A;
+  int lda;
+  cuComplex *x_null_check;
+  cuComplex x;
+  int incx;
+  cuComplex C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const cuComplex *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasCdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasCdgmm_64(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int64_t m;
+  int64_t n;
+  const cuComplex *A;
+  int64_t lda;
+  cuComplex *x_null_check;
+  cuComplex x;
+  int64_t incx;
+  cuComplex C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const cuComplex *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasCdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZdgmm(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int m;
+  int n;
+  const cuDoubleComplex *A;
+  int lda;
+  cuDoubleComplex *x_null_check;
+  cuDoubleComplex x;
+  int incx;
+  cuDoubleComplex C;
+  int ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasZdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasZdgmm_64(void *conn) {
+  cublasHandle_t handle;
+  cublasSideMode_t mode;
+  int64_t m;
+  int64_t n;
+  const cuDoubleComplex *A;
+  int64_t lda;
+  cuDoubleComplex *x_null_check;
+  cuDoubleComplex x;
+  int64_t incx;
+  cuDoubleComplex C;
+  int64_t ldc;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
+      (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) ||
+      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
+      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasZdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasSmatinvBatched(void *conn) {
+  int batchSize;
+  cublasHandle_t handle;
+  int n;
+  const float **A = nullptr;
+  int lda;
+  float **Ainv = nullptr;
+  int lda_inv;
+  int info;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const float *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &Ainv, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
+    goto ERROR_0;
+
+  request_id = rpc_end_request(conn);
+  if (request_id < 0)
+    goto ERROR_0;
+  scuda_intercept_result =
+      cublasSmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
+
+  if (rpc_start_response(conn, request_id) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
+      rpc_end_response(conn, &scuda_intercept_result) < 0)
+    goto ERROR_0;
+
+  return 0;
+ERROR_0:
+  return -1;
+}
+
+int handle_cublasDmatinvBatched(void *conn) {
+  int batchSize;
+  cublasHandle_t handle;
+  int n;
+  const double **A = nullptr;
+  int lda;
+  double **Ainv = nullptr;
+  int lda_inv;
+  int info;
+  int request_id;
+  cublasStatus_t scuda_intercept_result;
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const double *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &Ainv, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasCgeam_64(handle, transa, transb, m, n, &alpha,
-                                          A, lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result =
+      cublasDmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuComplex)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37174,51 +38905,35 @@ int handle_cublasCgeam_64(void *conn) {
   return -1;
 }
 
-int handle_cublasZgeam(void *conn) {
+int handle_cublasCmatinvBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasOperation_t transa;
-  cublasOperation_t transb;
-  int m;
   int n;
-  cuDoubleComplex *alpha_null_check;
-  cuDoubleComplex alpha;
-  const cuDoubleComplex *A;
+  const cuComplex **A = nullptr;
   int lda;
-  cuDoubleComplex *beta_null_check;
-  cuDoubleComplex beta;
-  const cuDoubleComplex *B;
-  int ldb;
-  cuDoubleComplex C;
-  int ldc;
+  cuComplex **Ainv = nullptr;
+  int lda_inv;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
-      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
-      rpc_read(conn, &m, sizeof(int)) < 0 ||
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
-      (alpha_null_check &&
-       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
-      (beta_null_check &&
-       rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
-      rpc_read(conn, &ldb, sizeof(int)) < 0 ||
-      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+      rpc_read(conn, &Ainv, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasZgeam(handle, transa, transb, m, n, &alpha, A,
-                                       lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result =
+      cublasCmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37227,51 +38942,35 @@ int handle_cublasZgeam(void *conn) {
   return -1;
 }
 
-int handle_cublasZgeam_64(void *conn) {
+int handle_cublasZmatinvBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasOperation_t transa;
-  cublasOperation_t transb;
-  int64_t m;
-  int64_t n;
-  cuDoubleComplex *alpha_null_check;
-  cuDoubleComplex alpha;
-  const cuDoubleComplex *A;
-  int64_t lda;
-  cuDoubleComplex *beta_null_check;
-  cuDoubleComplex beta;
-  const cuDoubleComplex *B;
-  int64_t ldb;
-  cuDoubleComplex C;
-  int64_t ldc;
+  int n;
+  const cuDoubleComplex **A = nullptr;
+  int lda;
+  cuDoubleComplex **Ainv = nullptr;
+  int lda_inv;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &transa, sizeof(cublasOperation_t)) < 0 ||
-      rpc_read(conn, &transb, sizeof(cublasOperation_t)) < 0 ||
-      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &alpha_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
-      (alpha_null_check &&
-       rpc_read(conn, &alpha, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 ||
-      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &beta_null_check, sizeof(const cuDoubleComplex *)) < 0 ||
-      (beta_null_check &&
-       rpc_read(conn, &beta, sizeof(const cuDoubleComplex)) < 0) ||
-      rpc_read(conn, &B, sizeof(const cuDoubleComplex *)) < 0 ||
-      rpc_read(conn, &ldb, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &A, sizeof(const cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &Ainv, sizeof(cuDoubleComplex *const *)) < 0 ||
+      rpc_read(conn, &lda_inv, sizeof(int)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result = cublasZgeam_64(handle, transa, transb, m, n, &alpha,
-                                          A, lda, &beta, B, ldb, &C, ldc);
+  scuda_intercept_result =
+      cublasZmatinvBatched(handle, n, A, lda, Ainv, lda_inv, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37280,41 +38979,35 @@ int handle_cublasZgeam_64(void *conn) {
   return -1;
 }
 
-int handle_cublasSdgmm(void *conn) {
+int handle_cublasSgeqrfBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasSideMode_t mode;
   int m;
   int n;
-  const float *A;
+  float **Aarray = nullptr;
   int lda;
-  float *x_null_check;
-  float x;
-  int incx;
-  float C;
-  int ldc;
+  float **TauArray = nullptr;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
       rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(float *const *)) < 0 ||
      rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &x_null_check, sizeof(const float *)) < 0 ||
-      (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) ||
-      rpc_read(conn, &incx, sizeof(int)) < 0 ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+      rpc_read(conn, &TauArray, sizeof(float *const *)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
    goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result =
-      cublasSdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+  scuda_intercept_result = cublasSgeqrfBatched(handle, m, n, Aarray, lda,
+                                               TauArray, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37323,41 +39016,35 @@ int handle_cublasSdgmm(void *conn) {
   return -1;
 }
 
-int handle_cublasSdgmm_64(void *conn) {
+int handle_cublasDgeqrfBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasSideMode_t mode;
-  int64_t m;
-  int64_t n;
-  const float *A;
-  int64_t lda;
-  float *x_null_check;
-  float x;
-  int64_t incx;
-  float C;
-  int64_t ldc;
+  int m;
+  int n;
+  double **Aarray = nullptr;
+  int lda;
+  double **TauArray = nullptr;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &n, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &A, sizeof(const float *)) < 0 ||
-      rpc_read(conn, &lda, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &x_null_check, sizeof(const float *)) < 0 ||
-      (x_null_check && rpc_read(conn, &x, sizeof(const float)) < 0) ||
-      rpc_read(conn, &incx, sizeof(int64_t)) < 0 ||
-      rpc_read(conn, &C, sizeof(float)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false)
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
+      rpc_read(conn, &m, sizeof(int)) < 0 ||
+      rpc_read(conn, &n, sizeof(int)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &lda, sizeof(int)) < 0 ||
+      rpc_read(conn, &TauArray, sizeof(double *const *)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
     goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result =
-      cublasSdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+  scuda_intercept_result = cublasDgeqrfBatched(handle, m, n, Aarray, lda,
+                                               TauArray, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(float)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37366,41 +39053,35 @@ int handle_cublasSdgmm_64(void *conn) {
   return -1;
 }
 
-int handle_cublasDdgmm(void *conn) {
+int handle_cublasCgeqrfBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasSideMode_t mode;
   int m;
   int n;
-  const double *A;
+  cuComplex **Aarray = nullptr;
   int lda;
-  double *x_null_check;
-  double x;
-  int incx;
-  double C;
-  int ldc;
+  cuComplex **TauArray = nullptr;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
+  if (rpc_read(conn, &batchSize, sizeof(int)) < 0 ||
+      rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
      rpc_read(conn, &m, sizeof(int)) < 0 ||
       rpc_read(conn, &n, sizeof(int)) < 0 ||
-      rpc_read(conn, &A, sizeof(const double *)) < 0 ||
+      rpc_read(conn, &Aarray, sizeof(cuComplex *const *)) < 0 ||
       rpc_read(conn, &lda, sizeof(int)) < 0 ||
-      rpc_read(conn, &x_null_check, sizeof(const double *)) < 0 ||
-      (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) ||
-      rpc_read(conn, &incx, sizeof(int)) < 0 ||
-      rpc_read(conn, &C, sizeof(double)) < 0 ||
-      rpc_read(conn, &ldc, sizeof(int)) < 0 || false)
+      rpc_read(conn, &TauArray, sizeof(cuComplex *const *)) < 0 ||
+      rpc_read(conn, &info, sizeof(int)) < 0 || false)
    goto ERROR_0;
 
   request_id = rpc_end_request(conn);
   if (request_id < 0)
     goto ERROR_0;
-  scuda_intercept_result =
-      cublasDdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc);
+  scuda_intercept_result = cublasCgeqrfBatched(handle, m, n, Aarray, lda,
+                                               TauArray, &info, batchSize);
 
   if (rpc_start_response(conn, request_id) < 0 ||
-      rpc_write(conn, &C, sizeof(double)) < 0 ||
+      rpc_write(conn, &info, sizeof(int)) < 0 ||
       rpc_end_response(conn, &scuda_intercept_result) < 0)
     goto ERROR_0;
@@ -37409,41 +39090,35 @@ int handle_cublasDdgmm(void *conn) {
   return -1;
 }
 
-int handle_cublasDdgmm_64(void *conn) {
+int handle_cublasZgeqrfBatched(void *conn) {
+  int batchSize;
   cublasHandle_t handle;
-  cublasSideMode_t mode;
-  int64_t m;
-  int64_t n;
-  const double *A;
-  int64_t lda;
-  double *x_null_check;
-  double x;
-  int64_t incx;
-  double C;
-  int64_t ldc;
+  int m;
+  int n;
+  cuDoubleComplex **Aarray = nullptr;
+  int lda;
+  cuDoubleComplex **TauArray = nullptr;
+  int info;
   int request_id;
   cublasStatus_t scuda_intercept_result;
-  if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 ||
-      rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 ||
-      rpc_read(conn, &m, sizeof(int64_t)) < 0 ||
- rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const double *)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const double *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const double)) < 0) || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(double)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &TauArray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; - scuda_intercept_result = - cublasDdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + scuda_intercept_result = cublasZgeqrfBatched(handle, m, n, Aarray, lda, + TauArray, &info, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(double)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37452,41 +39127,45 @@ int handle_cublasDdgmm_64(void *conn) { return -1; } -int handle_cublasCdgmm(void *conn) { +int handle_cublasSgelsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasSideMode_t mode; + cublasOperation_t trans; int m; int n; - const cuComplex *A; + int nrhs; + float **Aarray = nullptr; int lda; - cuComplex *x_null_check; - cuComplex x; - int incx; - cuComplex C; + float **Carray = nullptr; int ldc; + int info; + int devInfoArray; int request_id; cublasStatus_t scuda_intercept_result; - if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(float *const *)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const cuComplex *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) || - rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || false) + rpc_read(conn, &Carray, sizeof(float *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; scuda_intercept_result = - cublasCdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + &info, &devInfoArray, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37495,41 +39174,45 @@ int handle_cublasCdgmm(void *conn) { return -1; } -int handle_cublasCdgmm_64(void 
*conn) { +int handle_cublasDgelsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; - const cuComplex *A; - int64_t lda; - cuComplex *x_null_check; - cuComplex x; - int64_t incx; - cuComplex C; - int64_t ldc; + cublasOperation_t trans; + int m; + int n; + int nrhs; + double **Aarray = nullptr; + int lda; + double **Carray = nullptr; + int ldc; + int info; + int devInfoArray; int request_id; cublasStatus_t scuda_intercept_result; - if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const cuComplex *)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const cuComplex *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const cuComplex)) < 0) || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(cuComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(double *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(double *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; scuda_intercept_result = - cublasCdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + &info, &devInfoArray, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37538,41 +39221,45 @@ int handle_cublasCdgmm_64(void *conn) { return -1; } -int handle_cublasZdgmm(void *conn) { +int handle_cublasCgelsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasSideMode_t mode; + cublasOperation_t trans; int m; - int n; - const cuDoubleComplex *A; - int lda; - cuDoubleComplex *x_null_check; - cuDoubleComplex x; - int incx; - cuDoubleComplex C; + int n; + int nrhs; + cuComplex **Aarray = nullptr; + int lda; + cuComplex **Carray = nullptr; int ldc; + int info; + int devInfoArray; int request_id; cublasStatus_t scuda_intercept_result; - if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || rpc_read(conn, &m, sizeof(int)) < 0 || rpc_read(conn, &n, sizeof(int)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuComplex *const *)) < 0 || rpc_read(conn, &lda, sizeof(int)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) || - 
rpc_read(conn, &incx, sizeof(int)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int)) < 0 || false) + rpc_read(conn, &Carray, sizeof(cuComplex *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; scuda_intercept_result = - cublasZdgmm(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + cublasCgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + &info, &devInfoArray, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37581,41 +39268,45 @@ int handle_cublasZdgmm(void *conn) { return -1; } -int handle_cublasZdgmm_64(void *conn) { +int handle_cublasZgelsBatched(void *conn) { + int batchSize; cublasHandle_t handle; - cublasSideMode_t mode; - int64_t m; - int64_t n; - const cuDoubleComplex *A; - int64_t lda; - cuDoubleComplex *x_null_check; - cuDoubleComplex x; - int64_t incx; - cuDoubleComplex C; - int64_t ldc; + cublasOperation_t trans; + int m; + int n; + int nrhs; + cuDoubleComplex **Aarray = nullptr; + int lda; + cuDoubleComplex **Carray = nullptr; + int ldc; + int info; + int devInfoArray; int request_id; cublasStatus_t scuda_intercept_result; - if (rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || - rpc_read(conn, &mode, sizeof(cublasSideMode_t)) < 0 || - rpc_read(conn, &m, sizeof(int64_t)) < 0 || - rpc_read(conn, &n, sizeof(int64_t)) < 0 || - rpc_read(conn, &A, sizeof(const cuDoubleComplex *)) < 0 || - rpc_read(conn, &lda, sizeof(int64_t)) < 0 || - rpc_read(conn, &x_null_check, sizeof(const cuDoubleComplex *)) < 0 || - (x_null_check && rpc_read(conn, &x, sizeof(const cuDoubleComplex)) < 0) || - rpc_read(conn, &incx, sizeof(int64_t)) < 0 || - rpc_read(conn, &C, sizeof(cuDoubleComplex)) < 0 || - rpc_read(conn, &ldc, sizeof(int64_t)) < 0 || false) + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &m, sizeof(int)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &Carray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || + rpc_read(conn, &devInfoArray, sizeof(int)) < 0 || false) goto ERROR_0; request_id = rpc_end_request(conn); if (request_id < 0) goto ERROR_0; scuda_intercept_result = - cublasZdgmm_64(handle, mode, m, n, A, lda, &x, incx, &C, ldc); + cublasZgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + &info, &devInfoArray, batchSize); if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &C, sizeof(cuDoubleComplex)) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_write(conn, &devInfoArray, sizeof(int)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) goto ERROR_0; @@ -37897,6 +39588,352 @@ int handle_cublasZtrttp(void *conn) { return -1; } +int handle_cublasSgetriBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + int n; + const float **A = nullptr; + int lda; + int 
*P_null_check; + int P; + float **C = nullptr; + int ldc; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const float *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int *)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(float *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasSgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgetriBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + int n; + const double **A = nullptr; + int lda; + int *P_null_check; + int P; + double **C = nullptr; + int ldc; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const double *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int *)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(double *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasDgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgetriBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + int n; + const cuComplex **A = nullptr; + int lda; + int *P_null_check; + int P; + cuComplex **C = nullptr; + int ldc; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int *)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(cuComplex *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasCgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgetriBatched(void *conn) { + int batchSize; + cublasHandle_t 
handle; + int n; + const cuDoubleComplex **A = nullptr; + int lda; + int *P_null_check; + int P; + cuDoubleComplex **C = nullptr; + int ldc; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &A, sizeof(const cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &P_null_check, sizeof(const int *)) < 0 || + (P_null_check && rpc_read(conn, &P, sizeof(const int)) < 0) || + rpc_read(conn, &C, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &ldc, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasZgetriBatched(handle, n, A, lda, &P, C, ldc, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasSgetrsBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int n; + int nrhs; + const float **Aarray = nullptr; + int lda; + int *devIpiv_null_check; + int devIpiv; + float **Barray = nullptr; + int ldb; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const float *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &devIpiv_null_check, sizeof(const int *)) < 0 || + (devIpiv_null_check && rpc_read(conn, &devIpiv, sizeof(const int)) < 0) || + rpc_read(conn, &Barray, sizeof(float *const *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasSgetrsBatched(handle, trans, n, nrhs, Aarray, lda, &devIpiv, Barray, + ldb, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasDgetrsBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int n; + int nrhs; + const double **Aarray = nullptr; + int lda; + int *devIpiv_null_check; + int devIpiv; + double **Barray = nullptr; + int ldb; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const double *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &devIpiv_null_check, sizeof(const int *)) < 0 || + (devIpiv_null_check && rpc_read(conn, &devIpiv, sizeof(const int)) < 0) || + rpc_read(conn, &Barray, sizeof(double *const *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, 
sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasDgetrsBatched(handle, trans, n, nrhs, Aarray, lda, &devIpiv, Barray, + ldb, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasCgetrsBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int n; + int nrhs; + const cuComplex **Aarray = nullptr; + int lda; + const int *devIpiv; + cuComplex **Barray = nullptr; + int ldb; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &devIpiv, sizeof(const int *)) < 0 || + rpc_read(conn, &Barray, sizeof(cuComplex *const *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasCgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, + ldb, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cublasZgetrsBatched(void *conn) { + int batchSize; + cublasHandle_t handle; + cublasOperation_t trans; + int n; + int nrhs; + const cuDoubleComplex **Aarray = nullptr; + int lda; + int *devIpiv_null_check; + int devIpiv; + cuDoubleComplex **Barray = nullptr; + int ldb; + int info; + int request_id; + cublasStatus_t scuda_intercept_result; + if (rpc_read(conn, &batchSize, sizeof(int)) < 0 || + rpc_read(conn, &handle, sizeof(cublasHandle_t)) < 0 || + rpc_read(conn, &trans, sizeof(cublasOperation_t)) < 0 || + rpc_read(conn, &n, sizeof(int)) < 0 || + rpc_read(conn, &nrhs, sizeof(int)) < 0 || + rpc_read(conn, &Aarray, sizeof(const cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &lda, sizeof(int)) < 0 || + rpc_read(conn, &devIpiv_null_check, sizeof(const int *)) < 0 || + (devIpiv_null_check && rpc_read(conn, &devIpiv, sizeof(const int)) < 0) || + rpc_read(conn, &Barray, sizeof(cuDoubleComplex *const *)) < 0 || + rpc_read(conn, &ldb, sizeof(int)) < 0 || + rpc_read(conn, &info, sizeof(int)) < 0 || false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = + cublasZgetrsBatched(handle, trans, n, nrhs, Aarray, lda, &devIpiv, Barray, + ldb, &info, batchSize); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &info, sizeof(int)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + int handle_cublasUint8gemmBias(void *conn) { cublasHandle_t handle; cublasOperation_t transa; @@ -41188,6 +43225,8 @@ static RequestHandler opHandlers[] = { handle_cublasChpr2_v2_64, handle_cublasZhpr2_v2, handle_cublasZhpr2_v2_64, + handle_cublasSgemvBatched, + handle_cublasTSTgemvBatched, 
handle_cublasSgemvStridedBatched, handle_cublasSgemvStridedBatched_64, handle_cublasDgemvStridedBatched, @@ -41282,6 +43321,18 @@ static RequestHandler opHandlers[] = { handle_cublasCtrmm_v2_64, handle_cublasZtrmm_v2, handle_cublasZtrmm_v2_64, + handle_cublasHgemmBatched, + handle_cublasHgemmBatched_64, + handle_cublasSgemmBatched, + handle_cublasSgemmBatched_64, + handle_cublasDgemmBatched, + handle_cublasDgemmBatched_64, + handle_cublasCgemmBatched, + handle_cublasCgemmBatched_64, + handle_cublasCgemm3mBatched, + handle_cublasCgemm3mBatched_64, + handle_cublasZgemmBatched, + handle_cublasZgemmBatched_64, handle_cublasHgemmStridedBatched, handle_cublasHgemmStridedBatched_64, handle_cublasSgemmStridedBatched, @@ -41295,6 +43346,7 @@ static RequestHandler opHandlers[] = { handle_cublasZgemmStridedBatched, handle_cublasZgemmStridedBatched_64, nullptr, + handle_cublasGemmBatchedEx_64, handle_cublasSgeam, handle_cublasSgeam_64, handle_cublasDgeam, @@ -41303,6 +43355,14 @@ static RequestHandler opHandlers[] = { handle_cublasCgeam_64, handle_cublasZgeam, handle_cublasZgeam_64, + handle_cublasStrsmBatched, + handle_cublasStrsmBatched_64, + handle_cublasDtrsmBatched, + handle_cublasDtrsmBatched_64, + handle_cublasCtrsmBatched, + handle_cublasCtrsmBatched_64, + handle_cublasZtrsmBatched, + handle_cublasZtrsmBatched_64, handle_cublasSdgmm, handle_cublasSdgmm_64, handle_cublasDdgmm, @@ -41311,6 +43371,18 @@ static RequestHandler opHandlers[] = { handle_cublasCdgmm_64, handle_cublasZdgmm, handle_cublasZdgmm_64, + handle_cublasSmatinvBatched, + handle_cublasDmatinvBatched, + handle_cublasCmatinvBatched, + handle_cublasZmatinvBatched, + handle_cublasSgeqrfBatched, + handle_cublasDgeqrfBatched, + handle_cublasCgeqrfBatched, + handle_cublasZgeqrfBatched, + handle_cublasSgelsBatched, + handle_cublasDgelsBatched, + handle_cublasCgelsBatched, + handle_cublasZgelsBatched, handle_cublasStpttr, handle_cublasDtpttr, handle_cublasCtpttr, @@ -41319,6 +43391,14 @@ static RequestHandler opHandlers[] = { handle_cublasDtrttp, handle_cublasCtrttp, handle_cublasZtrttp, + handle_cublasSgetriBatched, + handle_cublasDgetriBatched, + handle_cublasCgetriBatched, + handle_cublasZgetriBatched, + handle_cublasSgetrsBatched, + handle_cublasDgetrsBatched, + handle_cublasCgetrsBatched, + handle_cublasZgetrsBatched, handle_cublasUint8gemmBias, nullptr, nullptr,
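
For context on how the opHandlers table extended above is consumed, the sketch below shows one plausible dispatch step over it. Only RequestHandler, opHandlers, rpc_read, and the nullptr-means-unimplemented convention are taken from this diff; the opcode framing, the n_handlers parameter, and the function name dispatch_one are illustrative assumptions, not the project's actual event loop (in the generated file the table is static, so a real dispatcher would live in the same translation unit).

// Sketch (not part of the diff): routing one incoming request to the table.
#include <cstddef>

typedef int (*RequestHandler)(void *conn);   // matches the table's element type
extern RequestHandler opHandlers[];          // the table this diff extends
extern int rpc_read(void *conn, void *buf, size_t size);

int dispatch_one(void *conn, size_t n_handlers) {
  unsigned int op;
  if (rpc_read(conn, &op, sizeof(unsigned int)) < 0)
    return -1;                               // connection or framing error
  if (op >= n_handlers || opHandlers[op] == nullptr)
    return -1;                               // unknown or unimplemented RPC
  return opHandlers[op](conn);               // handler reads args and replies
}

The table's order must match the RPC opcode numbering exactly, which is why this diff inserts entries at the same positions where the corresponding RPC_* constants were inserted, and why unsupported calls keep a nullptr placeholder rather than being omitted.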
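Every new batched handler above follows the same read/execute/write shape, so the wire layout a client must produce can be read straight off the rpc_read chain. As one example, this is the field order implied by handle_cublasSgeqrfBatched; the struct is only documentation of that order (an assumption for illustration), since the handler reads each field individually and struct padding never touches the wire.

// Sketch (not part of the diff): request layout for handle_cublasSgeqrfBatched.
#include <cublas_v2.h>

struct SgeqrfBatchedRequest {
  int batchSize;           // length argument sent first, before the arrays it sizes
  cublasHandle_t handle;
  int m;
  int n;
  float *const *Aarray;    // device pointer-array, transferred by value
  int lda;
  float *const *TauArray;  // device pointer-array, transferred by value
  int info;                // scalar in; the response returns the updated info
};
// The response body is the updated info followed by the cublasStatus_t result
// that rpc_end_response sends back.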