Gpu refactor (#4021)

porter-dev · Nov 28, 2023 · f07f194 · f07f194
1 parent 89d9a47
commit f07f194
Show file tree

Hide file tree

Showing 8 changed files with 171 additions and 75 deletions.
diff --git a/dashboard/src/lib/porter-apps/services.ts b/dashboard/src/lib/porter-apps/services.ts
@@ -82,6 +82,10 @@ export const serviceValidator = z.object({
   cpuCores: serviceNumberValidator,
   ramMegabytes: serviceNumberValidator,
   gpuCoresNvidia: serviceNumberValidator,
+  gpu: z.object({
+    enabled: serviceBooleanValidator,
+    gpuCoresNvidia: serviceNumberValidator,
+  }),
   smartOptimization: serviceBooleanValidator.optional(),
   terminationGracePeriodSeconds: serviceNumberValidator.optional(),
   config: z.discriminatedUnion("type", [
@@ -117,6 +121,10 @@ export type SerializedService = {
   ramMegabytes: number;
   smartOptimization?: boolean;
   gpuCoresNvidia: number;
+  gpu: {
+    enabled: boolean;
+    gpuCoresNvidia: number;
+  };
   terminationGracePeriodSeconds?: number;
   config:
     | {
@@ -196,6 +204,10 @@ export function defaultSerialized({
     cpuCores: defaultCPU,
     ramMegabytes: defaultRAM,
     gpuCoresNvidia: 0,
+    gpu: {
+      enabled: false,
+      gpuCoresNvidia: 0,
+    },
     smartOptimization: true,
   };
 
@@ -264,6 +276,10 @@ export function serializeService(service: ClientService): SerializedService {
     ramMegabytes: Math.round(service.ramMegabytes.value), // RAM must be an integer
     smartOptimization: service.smartOptimization?.value,
     gpuCoresNvidia: service.gpuCoresNvidia.value,
+    gpu: {
+      enabled: service.gpu.enabled.value,
+      gpuCoresNvidia: service.gpu.gpuCoresNvidia.value,
+    },
     terminationGracePeriodSeconds: service.terminationGracePeriodSeconds?.value,
     config: match(service.config)
       .with({ type: "web" }, (config) =>
@@ -336,6 +352,16 @@ export function deserializeService({
     instances: ServiceField.number(service.instances, override?.instances),
     port: ServiceField.number(service.port, override?.port),
     cpuCores: ServiceField.number(service.cpuCores, override?.cpuCores),
+    gpu: {
+      enabled: ServiceField.boolean(
+        service.gpu?.enabled,
+        override?.gpu?.enabled // guard a missing `gpu` like gpuCoresNvidia below
+      ),
+      gpuCoresNvidia: ServiceField.number(
+        service.gpu?.gpuCoresNvidia,
+        override?.gpu?.gpuCoresNvidia
+      ),
+    },
     gpuCoresNvidia: ServiceField.number(
       service.gpuCoresNvidia,
       override?.gpuCoresNvidia

diff --git a/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/ServiceContainer.tsx b/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/ServiceContainer.tsx
@@ -154,7 +154,7 @@ const ServiceContainer: React.FC<ServiceProps> = ({
           {service.name.value.trim().length > 0
             ? service.name.value
             : "New Service"}
-          {service.gpuCoresNvidia.value > 0 && (
+          {service.gpu.enabled.value && (
             <>
               <Spacer inline x={1.5} />
               <TagContainer>
@@ -276,7 +276,7 @@ const ServiceHeader = styled.div<{
     border-radius: 20px;
     margin-left: -10px;
     transform: ${(props: { showExpanded?: boolean }) =>
-      props.showExpanded ? "" : "rotate(-90deg)"};
+    props.showExpanded ? "" : "rotate(-90deg)"};
   }
 `;
 

diff --git a/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/Resources.tsx b/dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/Resources.tsx
@@ -227,27 +227,28 @@ const Resources: React.FC<ResourcesProps> = ({
           <>
             <Spacer y={1} />
             <Controller
-              name={`app.services.${index}.gpuCoresNvidia`}
+              name={`app.services.${index}.gpu`}
               control={control}
               render={({ field: { value, onChange } }) => (
                 <>
                   <Container row>
                     <Switch
                       size="small"
                       color="primary"
-                      checked={value.value > 0}
+                      checked={value.enabled.value}
                       disabled={!clusterContainsGPUNodes}
                       onChange={() => {
-                        if (value.value > 0) {
-                          onChange({
-                            ...value,
-                            value: 0,
-                          });
-                        } else
-                          onChange({
-                            ...value,
-                            value: 1,
-                          });
+                        onChange({
+                          ...value,
+                          enabled: {
+                            ...value.enabled,
+                            value: !value.enabled.value,
+                          },
+                          gpuCoresNvidia: {
+                            ...value.gpuCoresNvidia,
+                            value: value.enabled.value ? 0 : 1,
+                          }
+                        });
                       }}
                       inputProps={{ "aria-label": "controlled" }}
                     />
@@ -264,7 +265,7 @@ const Resources: React.FC<ResourcesProps> = ({
                           You cluster has no GPU nodes available.
                         </Text>
                         <Spacer inline x={0.5} />
-                        <Tag>
+                        {currentCluster.status !== "UPDATING" && <Tag>
                           <Link
                             onClick={() => {
                               setClusterModalVisible(true);
@@ -273,7 +274,7 @@ const Resources: React.FC<ResourcesProps> = ({
                             <TagIcon src={addCircle} />
                             Add GPU nodes
                           </Link>
-                        </Tag>
+                        </Tag>}
                       </>
                     )}
                   </Container>
@@ -290,23 +291,22 @@ const Resources: React.FC<ResourcesProps> = ({
                 </>
               )}
             />
-            {currentCluster.status === "UPDATING" &&
-              clusterContainsGPUNodes && (
-                <CheckItemContainer>
-                  <CheckItemTop>
-                    <Loading offset="0px" width="20px" height="20px" />
-                    <Spacer inline x={1} />
-                    <Text>{"Creating GPU nodes..."}</Text>
-                    <Spacer inline x={1} />
-                    <Tag>
-                      <Link to={`/cluster-dashboard`}>
-                        <TagIcon src={infra} />
-                        View Status
-                      </Link>
-                    </Tag>
-                  </CheckItemTop>
-                </CheckItemContainer>
-              )}
+            {(currentCluster.status === "UPDATING" && !clusterContainsGPUNodes) && (
+              <CheckItemContainer>
+                <CheckItemTop>
+                  <Loading offset="0px" width="20px" height="20px" />
+                  <Spacer inline x={1} />
+                  <Text>{"Cluster is updating..."}</Text>
+                  <Spacer inline x={1} />
+                  <Tag>
+                    <Link to={`/cluster-dashboard`}>
+                      <TagIcon src={infra} />
+                      View Status
+                    </Link>
+                  </Tag>
+                </CheckItemTop>
+              </CheckItemContainer>
+            )}
           </>
         )}
       {match(service.config)

diff --git a/internal/porter_app/test/parse_test.go b/internal/porter_app/test/parse_test.go
@@ -54,6 +54,10 @@ var result_nobuild = &porterv1.PorterApp{
 			Port:         8080,
 			CpuCores:     0.1,
 			RamMegabytes: 256,
+			Gpu: &porterv1.GPU{
+				Enabled:        false,
+				GpuCoresNvidia: 0,
+			},
 			Config: &porterv1.Service_WebConfig{
 				WebConfig: &porterv1.WebServiceConfig{
 					Autoscaling: &porterv1.Autoscaling{
@@ -87,6 +91,10 @@ var result_nobuild = &porterv1.PorterApp{
 			CpuCores:          0.1,
 			RamMegabytes:      256,
 			GpuCoresNvidia:    0,
+			Gpu: &porterv1.GPU{
+				Enabled:        false,
+				GpuCoresNvidia: 0,
+			},
 			Config: &porterv1.Service_WorkerConfig{
 				WorkerConfig: &porterv1.WorkerServiceConfig{
 					Autoscaling: nil,
@@ -100,6 +108,10 @@ var result_nobuild = &porterv1.PorterApp{
 			CpuCores:       0.1,
 			RamMegabytes:   256,
 			GpuCoresNvidia: 0,
+			Gpu: &porterv1.GPU{
+				Enabled:        false,
+				GpuCoresNvidia: 0,
+			},
 			Config: &porterv1.Service_JobConfig{
 				JobConfig: &porterv1.JobServiceConfig{
 					AllowConcurrentOptional: pointer.Bool(true),
@@ -119,6 +131,10 @@ var result_nobuild = &porterv1.PorterApp{
 			CpuCores:       0.1,
 			RamMegabytes:   256,
 			GpuCoresNvidia: 0,
+			Gpu: &porterv1.GPU{
+				Enabled:        false,
+				GpuCoresNvidia: 0,
+			},
 			Config: &porterv1.Service_WebConfig{
 				WebConfig: &porterv1.WebServiceConfig{
 					Autoscaling: &porterv1.Autoscaling{
@@ -152,6 +168,10 @@ var result_nobuild = &porterv1.PorterApp{
 			CpuCores:          0.1,
 			RamMegabytes:      256,
 			GpuCoresNvidia:    0,
+			Gpu: &porterv1.GPU{
+				Enabled:        false,
+				GpuCoresNvidia: 0,
+			},
 			Config: &porterv1.Service_WorkerConfig{
 				WorkerConfig: &porterv1.WorkerServiceConfig{
 					Autoscaling: nil,
@@ -165,6 +185,10 @@ var result_nobuild = &porterv1.PorterApp{
 			CpuCores:       0.1,
 			RamMegabytes:   256,
 			GpuCoresNvidia: 0,
+			Gpu: &porterv1.GPU{
+				Enabled:        false,
+				GpuCoresNvidia: 0,
+			},
 			Config: &porterv1.Service_JobConfig{
 				JobConfig: &porterv1.JobServiceConfig{
 					AllowConcurrentOptional: pointer.Bool(true),
@@ -182,6 +206,10 @@ var result_nobuild = &porterv1.PorterApp{
 		CpuCores:       0,
 		RamMegabytes:   0,
 		GpuCoresNvidia: 0,
+		Gpu: &porterv1.GPU{
+			Enabled:        false,
+			GpuCoresNvidia: 0,
+		},
 		Config:         &porterv1.Service_JobConfig{},
 		Type:           3,
 	},

diff --git a/internal/porter_app/testdata/v1_input_no_build_no_image.yaml b/internal/porter_app/testdata/v1_input_no_build_no_image.yaml
@@ -3,6 +3,7 @@ apps:
   example-job:
     type: job
     run: echo 'hello world'
+    gpu: {}
     config:
       allowConcurrent: true
       resources:
@@ -11,52 +12,54 @@ apps:
           memory: 256Mi
       schedule:
         enabled: true
-        value: '*/10 * * * *'
+        value: "*/10 * * * *"
       paused: true
       cloudsql:
         enabled: false
-        connectionName: ''
-        dbPort: '5432'
-        serviceAccountJSON: ''
+        connectionName: ""
+        dbPort: "5432"
+        serviceAccountJSON: ""
   example-wkr:
     type: worker
     run: "echo 'work'"
+    gpu: {}
     config:
-      replicaCount: '1'
+      replicaCount: "1"
       container:
-        port: '80'
+        port: "80"
       resources:
         requests:
           cpu: 100m
           memory: 256Mi
       autoscaling:
         enabled: false
-        minReplicas: '1'
-        maxReplicas: '10'
-        targetCPUUtilizationPercentage: '50'
-        targetMemoryUtilizationPercentage: '50'
+        minReplicas: "1"
+        maxReplicas: "10"
+        targetCPUUtilizationPercentage: "50"
+        targetMemoryUtilizationPercentage: "50"
       cloudsql:
         enabled: false
-        connectionName: ''
-        dbPort: '5432'
-        serviceAccountJSON: ''
+        connectionName: ""
+        dbPort: "5432"
+        serviceAccountJSON: ""
   example-web:
     type: web
     run: node index.js
+    gpu: {}
     config:
-      replicaCount: '0'
+      replicaCount: "0"
       resources:
         requests:
           cpu: 100m
           memory: 256Mi
       container:
-        port: '8080'
+        port: "8080"
       autoscaling:
         enabled: true
-        minReplicas: '1'
-        maxReplicas: '3'
-        targetCPUUtilizationPercentage: '60'
-        targetMemoryUtilizationPercentage: '60'
+        minReplicas: "1"
+        maxReplicas: "3"
+        targetCPUUtilizationPercentage: "60"
+        targetMemoryUtilizationPercentage: "60"
       ingress:
         enabled: true
         custom_domain: true
@@ -66,30 +69,30 @@ apps:
         porter_hosts: []
         annotations:
       service:
-        port: '8080'
+        port: "8080"
       health:
         startupProbe:
           enabled: false
-          failureThreshold: '3'
+          failureThreshold: "3"
           path: /startupz
-          periodSeconds: '5'
+          periodSeconds: "5"
         readinessProbe:
           enabled: true
-          failureThreshold: '3'
+          failureThreshold: "3"
           path: /healthz
-          initialDelaySeconds: '0'
+          initialDelaySeconds: "0"
         livenessProbe:
           enabled: true
-          failureThreshold: '3'
+          failureThreshold: "3"
           path: /healthz
-          periodSeconds: '5'
+          periodSeconds: "5"
       cloudsql:
         enabled: false
-        connectionName: ''
-        dbPort: '5432'
-        serviceAccountJSON: ''
+        connectionName: ""
+        dbPort: "5432"
+        serviceAccountJSON: ""
 release:
   run: ls
 env:
-  PORT: '8080'
-  NODE_ENV: 'production'
+  PORT: "8080"
+  NODE_ENV: "production"