diff --git a/dashboard/src/lib/porter-apps/services.ts b/dashboard/src/lib/porter-apps/services.ts index b969448b65..28fae78bba 100644 --- a/dashboard/src/lib/porter-apps/services.ts +++ b/dashboard/src/lib/porter-apps/services.ts @@ -82,6 +82,10 @@ export const serviceValidator = z.object({ cpuCores: serviceNumberValidator, ramMegabytes: serviceNumberValidator, gpuCoresNvidia: serviceNumberValidator, + gpu: z.object({ + enabled: serviceBooleanValidator, + gpuCoresNvidia: serviceNumberValidator, + }), smartOptimization: serviceBooleanValidator.optional(), terminationGracePeriodSeconds: serviceNumberValidator.optional(), config: z.discriminatedUnion("type", [ @@ -117,6 +121,10 @@ export type SerializedService = { ramMegabytes: number; smartOptimization?: boolean; gpuCoresNvidia: number; + gpu: { + enabled: boolean; + gpuCoresNvidia: number; + }; terminationGracePeriodSeconds?: number; config: | { @@ -196,6 +204,10 @@ export function defaultSerialized({ cpuCores: defaultCPU, ramMegabytes: defaultRAM, gpuCoresNvidia: 0, + gpu: { + enabled: false, + gpuCoresNvidia: 0, + }, smartOptimization: true, }; @@ -264,6 +276,10 @@ export function serializeService(service: ClientService): SerializedService { ramMegabytes: Math.round(service.ramMegabytes.value), // RAM must be an integer smartOptimization: service.smartOptimization?.value, gpuCoresNvidia: service.gpuCoresNvidia.value, + gpu: { + enabled: service.gpu.enabled.value, + gpuCoresNvidia: service.gpu.gpuCoresNvidia.value, + }, terminationGracePeriodSeconds: service.terminationGracePeriodSeconds?.value, config: match(service.config) .with({ type: "web" }, (config) => @@ -336,6 +352,16 @@ export function deserializeService({ instances: ServiceField.number(service.instances, override?.instances), port: ServiceField.number(service.port, override?.port), cpuCores: ServiceField.number(service.cpuCores, override?.cpuCores), + gpu: { + enabled: ServiceField.boolean( + service.gpu?.enabled, + override?.gpu?.enabled + ), + 
gpuCoresNvidia: ServiceField.number( + service.gpu?.gpuCoresNvidia, + override?.gpu?.gpuCoresNvidia + ), + }, gpuCoresNvidia: ServiceField.number( service.gpuCoresNvidia, override?.gpuCoresNvidia diff --git a/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/ServiceContainer.tsx b/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/ServiceContainer.tsx index b6d2f991ed..c19d05c1bb 100644 --- a/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/ServiceContainer.tsx +++ b/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/ServiceContainer.tsx @@ -154,7 +154,7 @@ const ServiceContainer: React.FC = ({ {service.name.value.trim().length > 0 ? service.name.value : "New Service"} - {service.gpuCoresNvidia.value > 0 && ( + {service.gpu.enabled.value && ( <> @@ -276,7 +276,7 @@ const ServiceHeader = styled.div<{ border-radius: 20px; margin-left: -10px; transform: ${(props: { showExpanded?: boolean }) => - props.showExpanded ? "" : "rotate(-90deg)"}; + props.showExpanded ? 
"" : "rotate(-90deg)"}; } `; diff --git a/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/Resources.tsx b/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/Resources.tsx index 72a7d37728..1fe632b9c4 100644 --- a/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/Resources.tsx +++ b/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/Resources.tsx @@ -227,7 +227,7 @@ const Resources: React.FC = ({ <> ( <> @@ -235,19 +235,20 @@ const Resources: React.FC = ({ 0} + checked={value.enabled.value} disabled={!clusterContainsGPUNodes} onChange={() => { - if (value.value > 0) { - onChange({ - ...value, - value: 0, - }); - } else - onChange({ - ...value, - value: 1, - }); + onChange({ + ...value, + enabled: { + ...value.enabled, + value: !value.enabled.value, + }, + gpuCoresNvidia: { + ...value.gpuCoresNvidia, + value: value.enabled.value ? 0 : 1, + } + }); }} inputProps={{ "aria-label": "controlled" }} /> @@ -264,7 +265,7 @@ const Resources: React.FC = ({ You cluster has no GPU nodes available. 
- + {currentCluster.status !== "UPDATING" && { setClusterModalVisible(true); @@ -273,7 +274,7 @@ const Resources: React.FC = ({ Add GPU nodes - + } )} @@ -290,23 +291,22 @@ const Resources: React.FC = ({ )} /> - {currentCluster.status === "UPDATING" && - clusterContainsGPUNodes && ( - - - - - {"Creating GPU nodes..."} - - - - - View Status - - - - - )} + {(currentCluster.status === "UPDATING" && !clusterContainsGPUNodes) && ( + + + + + {"Cluster is updating..."} + + + + + View Status + + + + + )} )} {match(service.config) diff --git a/internal/porter_app/test/parse_test.go b/internal/porter_app/test/parse_test.go index 99906cffd4..37250ad0ce 100644 --- a/internal/porter_app/test/parse_test.go +++ b/internal/porter_app/test/parse_test.go @@ -54,6 +54,10 @@ var result_nobuild = &porterv1.PorterApp{ Port: 8080, CpuCores: 0.1, RamMegabytes: 256, + Gpu: &porterv1.GPU{ + Enabled: false, + GpuCoresNvidia: 0, + }, Config: &porterv1.Service_WebConfig{ WebConfig: &porterv1.WebServiceConfig{ Autoscaling: &porterv1.Autoscaling{ @@ -87,6 +91,10 @@ var result_nobuild = &porterv1.PorterApp{ CpuCores: 0.1, RamMegabytes: 256, GpuCoresNvidia: 0, + Gpu: &porterv1.GPU{ + Enabled: false, + GpuCoresNvidia: 0, + }, Config: &porterv1.Service_WorkerConfig{ WorkerConfig: &porterv1.WorkerServiceConfig{ Autoscaling: nil, @@ -100,6 +108,10 @@ var result_nobuild = &porterv1.PorterApp{ CpuCores: 0.1, RamMegabytes: 256, GpuCoresNvidia: 0, + Gpu: &porterv1.GPU{ + Enabled: false, + GpuCoresNvidia: 0, + }, Config: &porterv1.Service_JobConfig{ JobConfig: &porterv1.JobServiceConfig{ AllowConcurrentOptional: pointer.Bool(true), @@ -119,6 +131,10 @@ var result_nobuild = &porterv1.PorterApp{ CpuCores: 0.1, RamMegabytes: 256, GpuCoresNvidia: 0, + Gpu: &porterv1.GPU{ + Enabled: false, + GpuCoresNvidia: 0, + }, Config: &porterv1.Service_WebConfig{ WebConfig: &porterv1.WebServiceConfig{ Autoscaling: &porterv1.Autoscaling{ @@ -152,6 +168,10 @@ var result_nobuild = &porterv1.PorterApp{ CpuCores: 0.1, 
RamMegabytes: 256, GpuCoresNvidia: 0, + Gpu: &porterv1.GPU{ + Enabled: false, + GpuCoresNvidia: 0, + }, Config: &porterv1.Service_WorkerConfig{ WorkerConfig: &porterv1.WorkerServiceConfig{ Autoscaling: nil, @@ -165,6 +185,10 @@ var result_nobuild = &porterv1.PorterApp{ CpuCores: 0.1, RamMegabytes: 256, GpuCoresNvidia: 0, + Gpu: &porterv1.GPU{ + Enabled: false, + GpuCoresNvidia: 0, + }, Config: &porterv1.Service_JobConfig{ JobConfig: &porterv1.JobServiceConfig{ AllowConcurrentOptional: pointer.Bool(true), @@ -182,6 +206,10 @@ var result_nobuild = &porterv1.PorterApp{ CpuCores: 0, RamMegabytes: 0, GpuCoresNvidia: 0, + Gpu: &porterv1.GPU{ + Enabled: false, + GpuCoresNvidia: 0, + }, Config: &porterv1.Service_JobConfig{}, Type: 3, }, diff --git a/internal/porter_app/testdata/v1_input_no_build_no_image.yaml b/internal/porter_app/testdata/v1_input_no_build_no_image.yaml index bc224f429a..0f0eba9b55 100644 --- a/internal/porter_app/testdata/v1_input_no_build_no_image.yaml +++ b/internal/porter_app/testdata/v1_input_no_build_no_image.yaml @@ -3,6 +3,7 @@ apps: example-job: type: job run: echo 'hello world' + gpu: {} config: allowConcurrent: true resources: @@ -11,52 +12,54 @@ apps: memory: 256Mi schedule: enabled: true - value: '*/10 * * * *' + value: "*/10 * * * *" paused: true cloudsql: enabled: false - connectionName: '' - dbPort: '5432' - serviceAccountJSON: '' + connectionName: "" + dbPort: "5432" + serviceAccountJSON: "" example-wkr: type: worker run: "echo 'work'" + gpu: {} config: - replicaCount: '1' + replicaCount: "1" container: - port: '80' + port: "80" resources: requests: cpu: 100m memory: 256Mi autoscaling: enabled: false - minReplicas: '1' - maxReplicas: '10' - targetCPUUtilizationPercentage: '50' - targetMemoryUtilizationPercentage: '50' + minReplicas: "1" + maxReplicas: "10" + targetCPUUtilizationPercentage: "50" + targetMemoryUtilizationPercentage: "50" cloudsql: enabled: false - connectionName: '' - dbPort: '5432' - serviceAccountJSON: '' + 
connectionName: "" + dbPort: "5432" + serviceAccountJSON: "" example-web: type: web run: node index.js + gpu: {} config: - replicaCount: '0' + replicaCount: "0" resources: requests: cpu: 100m memory: 256Mi container: - port: '8080' + port: "8080" autoscaling: enabled: true - minReplicas: '1' - maxReplicas: '3' - targetCPUUtilizationPercentage: '60' - targetMemoryUtilizationPercentage: '60' + minReplicas: "1" + maxReplicas: "3" + targetCPUUtilizationPercentage: "60" + targetMemoryUtilizationPercentage: "60" ingress: enabled: true custom_domain: true @@ -66,30 +69,30 @@ apps: porter_hosts: [] annotations: service: - port: '8080' + port: "8080" health: startupProbe: enabled: false - failureThreshold: '3' + failureThreshold: "3" path: /startupz - periodSeconds: '5' + periodSeconds: "5" readinessProbe: enabled: true - failureThreshold: '3' + failureThreshold: "3" path: /healthz - initialDelaySeconds: '0' + initialDelaySeconds: "0" livenessProbe: enabled: true - failureThreshold: '3' + failureThreshold: "3" path: /healthz - periodSeconds: '5' + periodSeconds: "5" cloudsql: enabled: false - connectionName: '' - dbPort: '5432' - serviceAccountJSON: '' + connectionName: "" + dbPort: "5432" + serviceAccountJSON: "" release: run: ls env: - PORT: '8080' - NODE_ENV: 'production' + PORT: "8080" + NODE_ENV: "production" diff --git a/internal/porter_app/testdata/v2_input_no_build_no_env.yaml b/internal/porter_app/testdata/v2_input_no_build_no_env.yaml index 75b2ce45b5..c7043c9445 100644 --- a/internal/porter_app/testdata/v2_input_no_build_no_env.yaml +++ b/internal/porter_app/testdata/v2_input_no_build_no_env.yaml @@ -10,6 +10,10 @@ services: port: 8080 cpuCores: 0.1 ramMegabytes: 256 + gpu: { + enabled: false, + gpuCoresNvidia: 0, + } autoscaling: enabled: true minInstances: 1 @@ -29,15 +33,27 @@ services: cpuCores: 0.1 ramMegabytes: 256 instances: 1 + gpu: { + enabled: false, + gpuCoresNvidia: 0, + } - name: example-job type: job run: echo 'hello world' allowConcurrent: true 
cpuCores: 0.1 ramMegabytes: 256 - cron: '*/10 * * * *' + cron: "*/10 * * * *" timeoutSeconds: 60 suspendCron: false + gpu: { + enabled: false, + gpuCoresNvidia: 0, + } predeploy: type: job run: ls + gpu: { + enabled: false, + gpuCoresNvidia: 0, + } diff --git a/internal/porter_app/testdata/v2_input_nobuild.yaml b/internal/porter_app/testdata/v2_input_nobuild.yaml index dde8f132d8..0cef0daa24 100644 --- a/internal/porter_app/testdata/v2_input_nobuild.yaml +++ b/internal/porter_app/testdata/v2_input_nobuild.yaml @@ -1,5 +1,5 @@ version: v2 -name: 'test-app' +name: "test-app" image: repository: nginx tag: latest @@ -10,6 +10,7 @@ services: port: 8080 cpuCores: 0.1 ramMegabytes: 256 + gpu: {} autoscaling: enabled: true minInstances: 1 @@ -28,6 +29,7 @@ services: port: 80 cpuCores: 0.1 ramMegabytes: 256 + gpu: {} instances: 1 - name: example-job type: job @@ -35,12 +37,14 @@ services: allowConcurrent: true cpuCores: 0.1 ramMegabytes: 256 - cron: '*/10 * * * *' + gpu: {} + cron: "*/10 * * * *" timeoutSeconds: 60 suspendCron: false predeploy: type: job run: ls + gpu: {} env: PORT: 8080 NODE_ENV: production diff --git a/internal/porter_app/v2/yaml.go b/internal/porter_app/v2/yaml.go index b18e92a641..4bbf05cf89 100644 --- a/internal/porter_app/v2/yaml.go +++ b/internal/porter_app/v2/yaml.go @@ -169,6 +169,7 @@ type Service struct { CpuCores float32 `yaml:"cpuCores,omitempty"` RamMegabytes int `yaml:"ramMegabytes,omitempty"` GpuCoresNvidia float32 `yaml:"gpuCoresNvidia,omitempty"` + GPU *GPU `yaml:"gpu,omitempty"` SmartOptimization *bool `yaml:"smartOptimization,omitempty"` TerminationGracePeriodSeconds *int32 `yaml:"terminationGracePeriodSeconds,omitempty"` Port int `yaml:"port,omitempty"` @@ -193,6 +194,12 @@ type AutoScaling struct { MemoryThresholdPercent int `yaml:"memoryThresholdPercent"` } +// GPU represents GPU settings for a service +type GPU struct { + Enabled bool `yaml:"enabled"` + GpuCoresNvidia int `yaml:"gpuCoresNvidia"` +} + // Domains are the custom domains 
for a web service type Domains struct { Name string `yaml:"name"` @@ -334,6 +341,14 @@ func serviceProtoFromConfig(service Service, serviceType porterv1.ServiceType) ( TerminationGracePeriodSeconds: service.TerminationGracePeriodSeconds, } + if service.GPU != nil { + gpu := &porterv1.GPU{ + Enabled: service.GPU.Enabled, + GpuCoresNvidia: int32(service.GPU.GpuCoresNvidia), + } + + serviceProto.Gpu = gpu + } switch serviceType { default: return nil, fmt.Errorf("invalid service type '%s'", serviceType) @@ -480,14 +495,18 @@ func AppFromProto(appProto *porterv1.PorterApp) (PorterApp, error) { func appServiceFromProto(service *porterv1.Service) (Service, error) { appService := Service{ - Name: service.Name, - Run: service.RunOptional, - Instances: service.InstancesOptional, - CpuCores: service.CpuCores, - RamMegabytes: int(service.RamMegabytes), - GpuCoresNvidia: service.GpuCoresNvidia, // nolint:staticcheck // https://linear.app/porter/issue/POR-2137/support-new-gpu-field-in-porteryaml - Port: int(service.Port), - SmartOptimization: service.SmartOptimization, + Name: service.Name, + Run: service.RunOptional, + Instances: service.InstancesOptional, + CpuCores: service.CpuCores, + RamMegabytes: int(service.RamMegabytes), + GpuCoresNvidia: service.GpuCoresNvidia, // nolint:staticcheck // https://linear.app/porter/issue/POR-2137/support-new-gpu-field-in-porteryaml + Port: int(service.Port), + SmartOptimization: service.SmartOptimization, + GPU: &GPU{ + Enabled: service.GetGpu().GetEnabled(), + GpuCoresNvidia: int(service.GetGpu().GetGpuCoresNvidia()), + }, TerminationGracePeriodSeconds: service.TerminationGracePeriodSeconds, }