-
Notifications
You must be signed in to change notification settings - Fork 0
/
radcos_template.txt
470 lines (369 loc) · 16.8 KB
/
radcos_template.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
################################
## Cluster Configuration File ##
################################
# Cluster definition for radcos_cluster. The Autoscale flag is taken from the
# Autoscale parameter.
[cluster radcos_cluster]
FormLayout = selectionpanel
Category = Schedulers
Autoscale = $Autoscale
# Shared node settings. NOTE(review): presumably inherited by the scheduler
# node and the hpc/htc nodearrays defined later in this file - confirm.
[[node defaults]]
UsePublicNetwork = $UsePublicNetwork
Credentials = $Credentials
SubnetId = $SubnetId
Region = $Region
KeyPairLocation = ~/.ssh/cyclecloud.pem
# NOTE(review): the managed identity is pinned to one specific subscription,
# resource group and identity name; deploying anywhere else means editing
# this line. Consider exposing it as a parameter.
Azure.Identities = /subscriptions/f0328804-fee6-481a-aba3-97864907535b/resourceGroups/uwradcos_tools/providers/Microsoft.ManagedIdentity/userAssignedIdentities/radcos_id
# Slurm autoscaling supports both Terminate and Deallocate shutdown policies
ShutdownPolicy = $configuration_slurm_shutdown_policy
[[[configuration]]]
# Slurm version and job-accounting settings are passed straight through from
# the configuration_slurm_* parameters.
slurm.version = $configuration_slurm_version
slurm.accounting.enabled = $configuration_slurm_accounting_enabled
slurm.accounting.url = $configuration_slurm_accounting_url
slurm.accounting.user = $configuration_slurm_accounting_user
slurm.accounting.password = $configuration_slurm_accounting_password
# For fast spin-up after Deallocate, force an immediate re-converge on boot
cyclecloud.converge_on_boot = true
# Disable normal NFS exports and mounts
cyclecloud.mounts.sched.disabled = true
cyclecloud.mounts.shared.disabled = true
cyclecloud.exports.sched.disabled = true
cyclecloud.exports.shared.disabled = true
cyclecloud.exports.sched.samba.enabled = false
cyclecloud.exports.shared.samba.enabled = false
cyclecloud.exports.defaults.samba.enabled = false
cshared.server.legacy_links_disabled = true
# NOTE(review): Optional = true presumably lets a node proceed when this
# cluster-init spec is unavailable - confirm.
[[[cluster-init cyclecloud/slurm:default]]]
Optional = true
# NFS mount of /shared; export path, server address and mount options come
# from the NFSSharedExportPath, NFSAddress and NFSSharedMountOptions parameters.
[[[configuration cyclecloud.mounts.nfs_shared]]]
type = nfs
mountpoint = /shared
export_path = $NFSSharedExportPath
address = $NFSAddress
options = $NFSSharedMountOptions
# NFS mount of /sched. No address or export_path is set here.
# NOTE(review): presumably these are filled in elsewhere (e.g. by the slurm
# project) - confirm.
[[[configuration cyclecloud.mounts.nfs_sched]]]
type = nfs
mountpoint = /sched
# Optional extra NFS mount; disabled unless the AdditionalNAS parameter is true.
# The parameter is spelled "AdditonalNFSAddress" (sic) in its declaration too;
# the reference below must keep matching that spelling.
[[[configuration cyclecloud.mounts.additional_nfs]]]
disabled = ${AdditionalNAS isnt true}
type = nfs
address = $AdditonalNFSAddress
mountpoint = $AdditionalNFSMountPoint
export_path = $AdditionalNFSExportPath
options = $AdditionalNFSMountOptions
# Scheduler node: machine type, image, return-proxy flag and extra
# cluster-init specs all come from parameters.
[[node scheduler]]
MachineType = $SchedulerMachineType
ImageName = $SchedulerImageName
IsReturnProxy = $ReturnProxy
AdditionalClusterInitSpecs = $SchedulerClusterInitSpecs
[[[configuration]]]
# The scheduler always disables the nfs_sched mount, and disables the
# nfs_shared mount unless NFSType is "External".
cyclecloud.mounts.nfs_sched.disabled = true
cyclecloud.mounts.nfs_shared.disabled = ${NFSType != "External"}
[[[cluster-init cyclecloud/slurm:scheduler]]]
[[[network-interface eth0]]]
AssociatePublicIpAddress = $UsePublicNetwork
# Endpoint named "ganglia" mapping port 8652 to 8652.
[[[input-endpoint ganglia]]]
PrivatePort = 8652
PublicPort = 8652
# Non-persistent SSD volume attached to the builtinsched mount.
# NOTE(review): Size is presumably in GB - confirm.
[[[volume sched]]]
Size = 1024
SSD = True
Mount = builtinsched
Persistent = False
# SSD volume attached to the builtinshared mount. Size is FilesystemSize when
# NFSType is "Builtin", otherwise 2; the volume is persistent only for "Builtin".
[[[volume shared]]]
Size = ${ifThenElse(NFSType == "Builtin", FilesystemSize, 2)}
SSD = True
Mount = builtinshared
Persistent = ${NFSType == "Builtin"}
# xfs filesystem mounted at /sched.
[[[configuration cyclecloud.mounts.builtinsched]]]
mountpoint = /sched
fs_type = xfs
# xfs filesystem mounted at /shared; disabled unless NFSType is "Builtin".
[[[configuration cyclecloud.mounts.builtinshared]]]
disabled = ${NFSType != "Builtin"}
mountpoint = /shared
fs_type = xfs
# NFS export of /sched with no_root_squash; Samba is turned off.
[[[configuration cyclecloud.exports.builtinsched]]]
export_path = /sched
options = no_root_squash
samba.enabled = false
type = nfs
# NFS export of /shared; disabled unless NFSType is "Builtin". Samba is off.
[[[configuration cyclecloud.exports.builtinshared]]]
disabled = ${NFSType != "Builtin"}
export_path = /shared
samba.enabled = false
type = nfs
# HPC execute nodearray. VM type, image, core limit, scaleset size and extra
# cluster-init specs are all parameter-driven.
[[nodearray hpc]]
MachineType = $HPCMachineType
ImageName = $HPCImageName
MaxCoreCount = $MaxHPCExecuteCoreCount
Azure.MaxScalesetSize = $HPCMaxScalesetSize
AdditionalClusterInitSpecs = $HPCClusterInitSpecs
[[[configuration]]]
# Autoscaled, flagged as HPC, and marked as the default Slurm partition.
slurm.autoscale = true
slurm.default_partition = true
slurm.hpc = true
[[[cluster-init cyclecloud/slurm:execute]]]
[[[network-interface eth0]]]
# Public IP assignment follows the ExecuteNodesPublic parameter.
AssociatePublicIpAddress = $ExecuteNodesPublic
# HTC execute nodearray. VM type, image, core limit and extra cluster-init
# specs are parameter-driven; Interruptible and MaxPrice come from the
# HTCUseLowPrio and HTCSpotMaxPrice parameters.
[[nodearray htc]]
MachineType = $HTCMachineType
ImageName = $HTCImageName
MaxCoreCount = $MaxHTCExecuteCoreCount
Interruptible = $HTCUseLowPrio
MaxPrice = $HTCSpotMaxPrice
AdditionalClusterInitSpecs = $HTCClusterInitSpecs
[[[configuration]]]
# Autoscaled and flagged as non-HPC; unlike the hpc nodearray this one does
# not set slurm.default_partition.
slurm.autoscale = true
slurm.hpc = false
[[[cluster-init cyclecloud/slurm:execute]]]
[[[network-interface eth0]]]
# Public IP assignment follows the ExecuteNodesPublic parameter.
AssociatePublicIpAddress = $ExecuteNodesPublic
# "About" parameter group: a single label-less HTML widget showing the Slurm
# icon plus links to the Slurm project site and the cyclecloud-slurm README.
[parameters About]
Order = 1
[[parameters About Slurm]]
[[[parameter slurm]]]
HideLabel = true
Config.Plugin = pico.widget.HtmlTemplateWidget
# The template is one quoted string assigned with ":="; inner double quotes
# are backslash-escaped.
Config.Template := "<table role=\"presentation\"><tr><td><img alt=\"Slurm icon\" src='static/cloud/cluster/ui/ClusterIcon/slurm.png' width='192' height='192'></td></tr><tr><td><p>Slurm is a highly configurable open source workload manager. See the <a href=\"https://www.schedmd.com/\" target=\"_blank\">Slurm project site</a> for an overview.</p><p>Follow the instructions in the <a href=\"https://github.com/azure/cyclecloud-slurm/\" target=\"_blank\">README</a> for details on instructions on extending and configuring the Project for your environment.</p></td></tr></table>"
# "Required Settings" parameter group (Order = 10).
[parameters Required Settings]
Order = 10
# Region and VM-type selection for the scheduler and the two execute
# nodearrays. NOTE(review): the section name carries a trailing space inside
# the brackets; left unchanged here.
[[parameters Virtual Machines ]]
Description = "The cluster, in this case, has two roles: the scheduler node with shared filer and the execute hosts. Configure which VM types to use based on the requirements of your application."
Order = 20
# Deployment region; referenced as $Region by the node defaults.
[[[parameter Region]]]
Label = Region
Description = Deployment Location
ParameterType = Cloud.Region
# VM size for the scheduler node.
[[[parameter SchedulerMachineType]]]
Label = Scheduler VM Type
Description = The VM type for scheduler node
ParameterType = Cloud.MachineType
DefaultValue = Standard_D12_v2
# VM size for the hpc nodearray.
[[[parameter HPCMachineType]]]
Label = HPC VM Type
Description = The VM type for HPC execute nodes
ParameterType = Cloud.MachineType
DefaultValue = Standard_F2s_v2
# VM size for the htc nodearray.
[[[parameter HTCMachineType]]]
Label = HTC VM Type
Description = The VM type for HTC execute nodes
ParameterType = Cloud.MachineType
DefaultValue = Standard_F2s_v2
# Autoscaling controls: the on/off switch, per-nodearray core limits, the
# scaleset size cap and Spot options for the htc nodearray.
# NOTE(review): the description mentions "initial and maximum core counts",
# but only maximum core counts are defined in this section.
[[parameters Auto-Scaling]]
Description = "The cluster can autoscale to the workload, adding execute hosts as jobs are queued. To enable this check the box below and choose the initial and maximum core counts for the cluster"
Order = 30
# Checkbox bound to the cluster-level Autoscale attribute.
[[[parameter Autoscale]]]
Label = Autoscale
DefaultValue = true
Widget.Plugin = pico.form.BooleanCheckBox
Widget.Label = Start and stop execute instances automatically
# Integer >= 1; used as MaxCoreCount of the hpc nodearray.
[[[parameter MaxHPCExecuteCoreCount]]]
Label = Max HPC Cores
Description = The total number of HPC execute cores to start
DefaultValue = 100
Config.Plugin = pico.form.NumberTextBox
Config.MinValue = 1
Config.IntegerOnly = true
# Integer >= 1; used as MaxCoreCount of the htc nodearray.
[[[parameter MaxHTCExecuteCoreCount]]]
Label = Max HTC Cores
Description = The total number of HTC execute cores to start
DefaultValue = 100
Config.Plugin = pico.form.NumberTextBox
Config.MinValue = 1
Config.IntegerOnly = true
# Integer >= 1; used as Azure.MaxScalesetSize of the hpc nodearray.
[[[parameter HPCMaxScalesetSize]]]
Label = Max VMs per Scaleset
Description = The maximum number of VMs created per VM Scaleset e.g. switch in Slurm.
DefaultValue = 40
Config.Plugin = pico.form.NumberTextBox
Config.MinValue = 1
Config.IntegerOnly = true
# Checkbox feeding the Interruptible attribute of the htc nodearray.
[[[parameter HTCUseLowPrio]]]
Label = Spot
DefaultValue = false
Widget.Plugin = pico.form.BooleanCheckBox
Widget.Label = Use Spot VMs for HTC execute hosts
# Spot price cap; excluded from the form unless HTCUseLowPrio is true.
# The minimum and default are both -1.
[[[parameter HTCSpotMaxPrice]]]
Label = Max Price
DefaultValue = -1
Description = Max price for Spot VMs in USD (value of -1 will not evict based on price)
Config.Plugin = pico.form.NumberTextBox
Conditions.Excluded := HTCUseLowPrio isnt true
Config.MinValue = -1
# Networking: the single required subnet selection, referenced as $SubnetId
# by the node defaults.
[[parameters Networking]]
Order = 40
[[[parameter SubnetId]]]
Label = Subnet ID
Description = Subnet Resource Path (ResourceGroup/VirtualNetwork/Subnet)
ParameterType = Azure.Subnet
Required = True
# "Network Attached Storage" parameter group (Order = 15).
[parameters Network Attached Storage]
Order = 15
# Chooses how /shared is provided: exported by the scheduler node ("Builtin")
# or mounted from an external NFS server ("External").
[[parameters Default NFS Share]]
Order = 10
# Label-less HTML blurb explaining the two /shared options.
[[[parameter About shared]]]
HideLabel = true
Config.Plugin = pico.widget.HtmlTemplateWidget
Config.Template := "<p>The directory <code>/shared</code> is a network attached mount and exists in all nodes of the cluster. Users' home directories reside within this mountpoint with the base homedir <code>/shared/home</code>.<br><br>There are two options for providing this mount:<br> <strong>[Builtin]</strong>: The scheduler node is an NFS server that provides the mountpoint to the other nodes of the cluster.<br> <strong>[External NFS]</strong>: A network attached storage such as Azure Netapp Files, HPC Cache, or another VM running an NFS server, provides the mountpoint.</p>"
Order = 20
# Dropdown whose stored values are "External" and "Builtin"; these exact
# strings are compared against NFSType in the scheduler node section and in
# the Conditions below.
[[[parameter NFSType]]]
Label = NFS Type
ParameterType = StringList
Config.Label = Type of NFS to use for this cluster
Config.Plugin = pico.form.Dropdown
Config.Entries := {[Label="External NFS"; Value="External"], [Label="Builtin"; Value="Builtin"]}
DefaultValue = Builtin
# External NFS server address; hidden unless NFSType is "External".
# NOTE(review): this uses Config.ParameterType while other parameters in the
# file use ParameterType - confirm which one is intended.
[[[parameter NFSAddress]]]
Label = NFS IP Address
Description = The IP address or hostname of the NFS server. Also accepts a comma-separated list of addresses, for example, to mount a frontend load-balanced Azure HPC Cache.
Config.ParameterType = String
Conditions.Hidden := NFSType != "External"
# Export path on the external server; hidden unless NFSType is "External".
[[[parameter NFSSharedExportPath]]]
Label = Shared Export Path
Description = The path exported by the file system
DefaultValue = /shared
Conditions.Hidden := NFSType != "External"
# Client mount options for the external share; hidden unless NFSType is
# "External". No default is defined.
[[[parameter NFSSharedMountOptions]]]
Label = NFS Mount Options
Description = NFS Client Mount Options
Conditions.Hidden := NFSType != "External"
# Size of the builtin shared volume (integer, 10 to 10240); excluded unless
# NFSType is "Builtin". Used as the Size of the scheduler's "shared" volume.
[[[parameter FilesystemSize]]]
Label = Size (GB)
Description = The filesystem size
DefaultValue = 100
Config.Plugin = pico.form.NumberTextBox
Config.MinValue = 10
Config.MaxValue = 10240
Config.IntegerOnly = true
Conditions.Excluded := NFSType != "Builtin"
# Optional second NFS mount applied to the cluster nodes. Every field except
# the checkbox is excluded from the form unless AdditionalNAS is true.
[[parameters Additional NFS Mount]]
Order = 20
# Label-less HTML blurb introducing the section.
[[[parameter Additional NFS Mount Readme]]]
HideLabel = true
Config.Plugin = pico.widget.HtmlTemplateWidget
Config.Template := "<p>Mount another NFS endpoint on the cluster nodes</p>"
Order = 20
# Master switch; also drives "disabled" on the additional_nfs mount in the
# node defaults.
[[[parameter AdditionalNAS]]]
HideLabel = true
DefaultValue = false
Widget.Plugin = pico.form.BooleanCheckBox
Widget.Label = Add NFS mount
# NOTE(review): the parameter name is spelled "AdditonalNFSAddress" (sic).
# It is referenced with the same spelling as $AdditonalNFSAddress in the node
# defaults, so it is kept as-is; renaming it means changing both places and
# any saved parameter files.
[[[parameter AdditonalNFSAddress]]]
Label = NFS IP Address
Description = The IP address or hostname of the NFS server. Also accepts a comma-separated list of addresses, for example, to mount a frontend load-balanced Azure HPC Cache.
Config.ParameterType = String
Conditions.Excluded := AdditionalNAS isnt true
# Local mount point for the additional share.
[[[parameter AdditionalNFSMountPoint]]]
Label = NFS Mount Point
Description = The path at which to mount the Filesystem
DefaultValue = /data
Conditions.Excluded := AdditionalNAS isnt true
# Path exported by the remote server.
[[[parameter AdditionalNFSExportPath]]]
Label = NFS Export Path
Description = The path exported by the file system
DefaultValue = /data
Conditions.Excluded := AdditionalNAS isnt true
# Client mount options for the additional share. No default is defined.
[[[parameter AdditionalNFSMountOptions]]]
Label = NFS Mount Options
Description = NFS Client Mount Options
Conditions.Excluded := AdditionalNAS isnt true
# "Advanced Settings" parameter group (Order = 20).
[parameters Advanced Settings]
Order = 20
# Cloud credentials selection, referenced as $Credentials by the node defaults.
[[parameters Azure Settings]]
Order = 10
[[[parameter Credentials]]]
Description = The credentials for the cloud provider
ParameterType = Cloud.Credentials
# Slurm-specific options: version, optional job accounting, and the shutdown
# policy. All of them are referenced from the node defaults section.
[[parameters Slurm Settings ]]
Description = "Section for configuring Slurm"
Order = 5
# Required Slurm version. The dropdown lists two versions.
# NOTE(review): Config.FreeForm = true presumably allows typing a version
# that is not listed - confirm.
[[[parameter configuration_slurm_version]]]
Required = True
Label = Slurm Version
Description = Version of Slurm to install on the cluster
ParameterType = StringList
Config.Plugin = pico.form.Dropdown
Config.FreeForm = true
Config.Entries := {[Value="19.05.8-1"], [Value="20.11.3-1"]}
DefaultValue = 19.05.8-1
# Checkbox that gates the three accounting fields below.
[[[parameter configuration_slurm_accounting_enabled]]]
Label = Job Accounting
DefaultValue = false
Widget.Plugin = pico.form.BooleanCheckBox
Widget.Label = Configure Slurm job accounting
# Accounting database URL; excluded unless job accounting is enabled.
[[[parameter configuration_slurm_accounting_url]]]
Label = Slurm DBD URL
Description = URL of the database to use for Slurm job accounting
Conditions.Excluded := configuration_slurm_accounting_enabled isnt true
# Accounting database user; excluded unless job accounting is enabled.
[[[parameter configuration_slurm_accounting_user]]]
Label = Slurm DBD User
Description = User for Slurm DBD admin
Conditions.Excluded := configuration_slurm_accounting_enabled isnt true
# Accounting database password (Password-typed field); excluded unless job
# accounting is enabled.
[[[parameter configuration_slurm_accounting_password]]]
Label = Slurm DBD Password
Description = Password for Slurm DBD admin
ParameterType = Password
Conditions.Excluded := configuration_slurm_accounting_enabled isnt true
# Shutdown policy bound to ShutdownPolicy in the node defaults. The attribute
# keys use the same casing as the rest of the file (Description, Config.Plugin).
# The two entries are declared as repeated list sections; keep their order.
[[[parameter configuration_slurm_shutdown_policy]]]
Label = ShutdownPolicy
Description = By default, autostop will Delete stopped VMS for lowest cost. Optionally, Stop/Deallocate the VMs for faster restart instead.
DefaultValue = Terminate
Config.Plugin = pico.control.AutoCompleteDropdown
[[[[list Config.Entries]]]]
Name = Terminate
Label = Terminate
[[[[list Config.Entries]]]]
Name = Deallocate
Label = Deallocate
# OS image and extra cluster-init selection for the scheduler, hpc and htc
# nodes.
[[parameters Software]]
Description = "Specify the scheduling software, and base OS installed on all nodes, and optionally the cluster-init and chef versions from your Locker."
Order = 10
# The three image parameters below share the same default (cycle.image.centos7)
# and the same filter, which restricts the choice to the centos7 and ubuntu18
# packages.
[[[parameter SchedulerImageName]]]
Label = Scheduler OS
ParameterType = Cloud.Image
Config.OS = linux
DefaultValue = cycle.image.centos7
Config.Filter := Package in {"cycle.image.centos7", "cycle.image.ubuntu18"}
[[[parameter HPCImageName]]]
Label = HPC OS
ParameterType = Cloud.Image
Config.OS = linux
DefaultValue = cycle.image.centos7
Config.Filter := Package in {"cycle.image.centos7", "cycle.image.ubuntu18"}
[[[parameter HTCImageName]]]
Label = HTC OS
ParameterType = Cloud.Image
Config.OS = linux
DefaultValue = cycle.image.centos7
Config.Filter := Package in {"cycle.image.centos7", "cycle.image.ubuntu18"}
# The three cluster-init parameters below feed AdditionalClusterInitSpecs on
# the matching node or nodearray.
# NOTE(review): "DefaultValue = =undefined" presumably evaluates to an
# undefined value rather than the literal string - confirm.
[[[parameter SchedulerClusterInitSpecs]]]
Label = Scheduler Cluster-Init
DefaultValue = =undefined
Description = Cluster init specs to apply to the scheduler node
ParameterType = Cloud.ClusterInitSpecs
[[[parameter HTCClusterInitSpecs]]]
Label = HTC Cluster-Init
DefaultValue = =undefined
Description = Cluster init specs to apply to HTC execute nodes
ParameterType = Cloud.ClusterInitSpecs
[[[parameter HPCClusterInitSpecs]]]
Label = HPC Cluster-Init
DefaultValue = =undefined
Description = Cluster init specs to apply to HPC execute nodes
ParameterType = Cloud.ClusterInitSpecs
# Boolean networking switches: return proxy and public IP exposure.
[[parameters Advanced Networking]]
Description = Advanced networking settings
# Bound to IsReturnProxy on the scheduler node.
[[[parameter ReturnProxy]]]
Label = Return Proxy
DefaultValue = true
ParameterType = Boolean
Config.Label = Use SSH tunnel to connect to CycleCloud (required if direct access is blocked)
# Bound to UsePublicNetwork in the node defaults and to the scheduler's eth0
# AssociatePublicIpAddress.
[[[parameter UsePublicNetwork]]]
Label = Public Head Node
DefaultValue = true
ParameterType = Boolean
Config.Label = Access scheduler node from the Internet
# Bound to AssociatePublicIpAddress on the hpc and htc nodearrays; excluded
# from the form unless UsePublicNetwork is true.
[[[parameter ExecuteNodesPublic]]]
Label = Public Execute
DefaultValue = false
ParameterType = Boolean
Config.Label = Access execute nodes from the Internet
Conditions.Excluded := UsePublicNetwork isnt true