diff --git a/.github/workflows/e2e-workflow.yml b/.github/workflows/e2e-workflow.yml index 97a5e994d..8d88ea8b0 100644 --- a/.github/workflows/e2e-workflow.yml +++ b/.github/workflows/e2e-workflow.yml @@ -85,7 +85,7 @@ jobs: go-version: ${{ env.GO_VERSION }} - name: Az login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1 + uses: azure/login@v2.2.0 with: client-id: ${{ secrets.E2E_CLIENT_ID }} tenant-id: ${{ secrets.E2E_TENANT_ID }} diff --git a/Makefile b/Makefile index 7851ed8a5..4477ad6a1 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ # Image URL to use all building/pushing image targets REGISTRY ?= YOUR_REGISTRY IMG_NAME ?= workspace -VERSION ?= v0.3.1 -GPU_PROVISIONER_VERSION ?= 0.2.0 +VERSION ?= v0.3.2 +GPU_PROVISIONER_VERSION ?= 0.2.1 IMG_TAG ?= $(subst v,,$(VERSION)) ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) @@ -262,6 +262,7 @@ gpu-provisioner-helm: ## Update Azure client env vars and settings in helm valu helm install $(GPU_PROVISIONER_NAMESPACE) \ --values gpu-provisioner-values.yaml \ --set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) \ + --namespace $(GPU_PROVISIONER_NAMESPACE) --create-namespace \ https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$(GPU_PROVISIONER_VERSION).tgz kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s diff --git a/api/v1alpha1/workspace_types.go b/api/v1alpha1/workspace_types.go index 64cb9967a..35069423f 100644 --- a/api/v1alpha1/workspace_types.go +++ b/api/v1alpha1/workspace_types.go @@ -34,8 +34,7 @@ type ResourceSpec struct { LabelSelector *metav1.LabelSelector `json:"labelSelector"` // PreferredNodes is an optional node list specified by the user. - // If a node in the list does not have the required labels or - // the required instanceType, it will be ignored. + // If a node in the list does not have the required labels, it will be ignored. // +optional PreferredNodes []string `json:"preferredNodes,omitempty"` } diff --git a/charts/kaito/workspace/Chart.yaml b/charts/kaito/workspace/Chart.yaml index 282e4aa04..fb457be82 100644 --- a/charts/kaito/workspace/Chart.yaml +++ b/charts/kaito/workspace/Chart.yaml @@ -6,13 +6,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.3.1 +version: 0.3.2 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: 0.3.1 +appVersion: 0.3.2 home: https://github.com/Azure/kaito sources: - https://github.com/Azure/kaito diff --git a/charts/kaito/workspace/README.md b/charts/kaito/workspace/README.md index 067b94450..3315c5e39 100644 --- a/charts/kaito/workspace/README.md +++ b/charts/kaito/workspace/README.md @@ -5,7 +5,7 @@ ```bash export REGISTRY=mcr.microsoft.com/aks/kaito export IMG_NAME=workspace -export IMG_TAG=0.3.1 +export IMG_TAG=0.3.2 helm install workspace ./charts/kaito/workspace \ --set image.repository=${REGISTRY}/$(IMG_NAME) --set image.tag=$(IMG_TAG) \ --namespace kaito-workspace --create-namespace diff --git a/charts/kaito/workspace/values.yaml b/charts/kaito/workspace/values.yaml index 205d26901..702ec60ea 100644 --- a/charts/kaito/workspace/values.yaml +++ b/charts/kaito/workspace/values.yaml @@ -5,7 +5,7 @@ replicaCount: 1 image: repository: mcr.microsoft.com/aks/kaito/workspace pullPolicy: IfNotPresent - tag: 0.3.1 + tag: 0.3.2 imagePullSecrets: [] podAnnotations: {} podSecurityContext: diff --git a/docs/installation.md b/docs/installation.md index 86ec80843..df201c108 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -77,7 +77,7 @@ az role assignment create --assignee $IDENTITY_PRINCIPAL_ID --scope /subscriptio Install the Node provisioner controller. ```bash # get additional values for helm chart install -export GPU_PROVISIONER_VERSION=0.2.0 +export GPU_PROVISIONER_VERSION=0.2.1 curl -sO https://raw.githubusercontent.com/Azure/gpu-provisioner/main/hack/deploy/configure-helm-values.sh chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $MY_CLUSTER $RESOURCE_GROUP $IDENTITY_NAME diff --git a/examples/inference/kaito_workspace_phi_3_medium_128k.yaml b/examples/inference/kaito_workspace_phi_3_medium_128k.yaml new file mode 100644 index 000000000..41a137522 --- /dev/null +++ b/examples/inference/kaito_workspace_phi_3_medium_128k.yaml @@ -0,0 +1,12 @@ +apiVersion: kaito.sh/v1alpha1 +kind: Workspace +metadata: + name: workspace-phi-3-medium +resource: + instanceType: "Standard_NC24ads_A100_v4" + labelSelector: + matchLabels: + apps: phi-3 +inference: + preset: + name: phi-3-medium-128k-instruct diff --git a/examples/inference/kaito_workspace_phi_3_medium.yaml b/examples/inference/kaito_workspace_phi_3_medium_4k.yaml similarity index 74% rename from examples/inference/kaito_workspace_phi_3_medium.yaml rename to examples/inference/kaito_workspace_phi_3_medium_4k.yaml index 27cc63e25..7a44d7590 100644 --- a/examples/inference/kaito_workspace_phi_3_medium.yaml +++ b/examples/inference/kaito_workspace_phi_3_medium_4k.yaml @@ -10,4 +10,3 @@ resource: inference: preset: name: phi-3-medium-4k-instruct - # Note: This configuration also works with the phi-3-medium-128k-instruct preset diff --git a/examples/inference/kaito_workspace_phi_3_mini_128k.yaml b/examples/inference/kaito_workspace_phi_3_mini_128k.yaml new file mode 100644 index 000000000..162495ef8 --- /dev/null +++ b/examples/inference/kaito_workspace_phi_3_mini_128k.yaml @@ -0,0 +1,12 @@ +apiVersion: kaito.sh/v1alpha1 +kind: Workspace +metadata: + name: workspace-phi-3-mini +resource: + instanceType: "Standard_NC6s_v3" + labelSelector: + matchLabels: + apps: phi-3 +inference: + preset: + name: phi-3-mini-128k-instruct diff --git a/examples/inference/kaito_workspace_phi_3_mini.yaml b/examples/inference/kaito_workspace_phi_3_mini_4k.yaml similarity index 73% rename from examples/inference/kaito_workspace_phi_3_mini.yaml rename to examples/inference/kaito_workspace_phi_3_mini_4k.yaml index aa2a2cfe2..33cd49d68 100644 --- a/examples/inference/kaito_workspace_phi_3_mini.yaml +++ b/examples/inference/kaito_workspace_phi_3_mini_4k.yaml @@ -10,4 +10,3 @@ resource: inference: preset: name: phi-3-mini-4k-instruct - # Note: This configuration also works with the phi-3-mini-128k-instruct preset diff --git a/pkg/controllers/workspace_controller.go b/pkg/controllers/workspace_controller.go index d193a7d8c..25ef22faa 100644 --- a/pkg/controllers/workspace_controller.go +++ b/pkg/controllers/workspace_controller.go @@ -40,6 +40,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" "k8s.io/utils/clock" @@ -385,7 +386,7 @@ func (c *WorkspaceReconciler) applyWorkspaceResource(ctx context.Context, wObj * } } - // Find all nodes that match the labelSelector and instanceType, they are not necessarily created by machines/nodeClaims. + // Find all nodes that meet the requirements, they are not necessarily created by machines/nodeClaims. validNodes, err := c.getAllQualifiedNodes(ctx, wObj) if err != nil { return err @@ -474,7 +475,6 @@ func (c *WorkspaceReconciler) applyWorkspaceResource(ctx context.Context, wObj * return nil } -// getAllQualifiedNodes returns all nodes that match the labelSelector and instanceType. func (c *WorkspaceReconciler) getAllQualifiedNodes(ctx context.Context, wObj *kaitov1alpha1.Workspace) ([]*corev1.Node, error) { var qualifiedNodes []*corev1.Node @@ -488,33 +488,36 @@ func (c *WorkspaceReconciler) getAllQualifiedNodes(ctx context.Context, wObj *ka return nil, nil } + preferredNodeSet := sets.New(wObj.Resource.PreferredNodes...) + for index := range nodeList.Items { nodeObj := nodeList.Items[index] - // Skip nodes that are being deleted + // skip nodes that are being deleted if nodeObj.DeletionTimestamp != nil { continue } - foundInstanceType := c.validateNodeInstanceType(ctx, wObj, lo.ToPtr(nodeObj)) + + // skip nodes that are not ready _, statusRunning := lo.Find(nodeObj.Status.Conditions, func(condition corev1.NodeCondition) bool { return condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue }) + if !statusRunning { + continue + } - if foundInstanceType && statusRunning { + // match the preferred node + if preferredNodeSet.Has(nodeObj.Name) { qualifiedNodes = append(qualifiedNodes, lo.ToPtr(nodeObj)) + continue } - } - return qualifiedNodes, nil -} - -// check if node has the required instanceType -func (c *WorkspaceReconciler) validateNodeInstanceType(ctx context.Context, wObj *kaitov1alpha1.Workspace, nodeObj *corev1.Node) bool { - if instanceTypeLabel, found := nodeObj.Labels[corev1.LabelInstanceTypeStable]; found { - if instanceTypeLabel != wObj.Resource.InstanceType { - return false + // match the instanceType + if nodeObj.Labels[corev1.LabelInstanceTypeStable] == wObj.Resource.InstanceType { + qualifiedNodes = append(qualifiedNodes, lo.ToPtr(nodeObj)) } } - return true + + return qualifiedNodes, nil } // createAndValidateNode creates a new node and validates status. diff --git a/pkg/controllers/workspace_controller_test.go b/pkg/controllers/workspace_controller_test.go index f71493523..ba473bc07 100644 --- a/pkg/controllers/workspace_controller_test.go +++ b/pkg/controllers/workspace_controller_test.go @@ -751,29 +751,108 @@ func TestApplyInferenceWithTemplate(t *testing.T) { } func TestGetAllQualifiedNodes(t *testing.T) { + deletedNode := corev1.Node{ + ObjectMeta: v1.ObjectMeta{ + Name: "node4", + Labels: map[string]string{ + corev1.LabelInstanceTypeStable: "Standard_NC12s_v3", + }, + DeletionTimestamp: &v1.Time{Time: time.Now()}, + }, + } + testcases := map[string]struct { callMocks func(c *test.MockClient) + workspace *v1alpha1.Workspace expectedError error + expectedNodes []string }{ "Fails to get qualified nodes because can't list nodes": { callMocks: func(c *test.MockClient) { c.On("List", mock.IsType(context.Background()), mock.IsType(&corev1.NodeList{}), mock.Anything).Return(errors.New("Failed to list nodes")) }, + workspace: test.MockWorkspaceDistributedModel, expectedError: errors.New("Failed to list nodes"), + expectedNodes: nil, }, "Gets all qualified nodes": { callMocks: func(c *test.MockClient) { nodeList := test.MockNodeList - deletedNode := corev1.Node{ - ObjectMeta: v1.ObjectMeta{ - Name: "node4", - Labels: map[string]string{ - corev1.LabelInstanceTypeStable: "Standard_NC12s_v3", + + nodeList.Items = append(nodeList.Items, deletedNode) + + relevantMap := c.CreateMapWithType(nodeList) + //insert node objects into the map + for _, obj := range test.MockNodeList.Items { + n := obj + objKey := client.ObjectKeyFromObject(&n) + + relevantMap[objKey] = &n + } + + c.On("List", mock.IsType(context.Background()), mock.IsType(&corev1.NodeList{}), mock.Anything).Return(nil) + }, + workspace: test.MockWorkspaceDistributedModel, + expectedError: nil, + expectedNodes: []string{"node1"}, + }, + "Gets all qualified nodes with preferred": { + callMocks: func(c *test.MockClient) { + nodeList := test.MockNodeList + + nodeList.Items = append(nodeList.Items, deletedNode) + + nodesFromOtherVendor := []corev1.Node{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node-p1", + Labels: map[string]string{ + corev1.LabelInstanceTypeStable: "vendor1", + }, + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + { + ObjectMeta: v1.ObjectMeta{ + Name: "node-p2", + Labels: map[string]string{ + corev1.LabelInstanceTypeStable: "vendor2", + }, + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionFalse, + }, + }, + }, + }, + { + ObjectMeta: v1.ObjectMeta{ + Name: "node-p3", + Labels: map[string]string{ + corev1.LabelInstanceTypeStable: "vendor1", + }, + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, }, - DeletionTimestamp: &v1.Time{Time: time.Now()}, }, } - nodeList.Items = append(nodeList.Items, deletedNode) + nodeList.Items = append(nodeList.Items, nodesFromOtherVendor...) relevantMap := c.CreateMapWithType(nodeList) //insert node objects into the map @@ -786,14 +865,15 @@ func TestGetAllQualifiedNodes(t *testing.T) { c.On("List", mock.IsType(context.Background()), mock.IsType(&corev1.NodeList{}), mock.Anything).Return(nil) }, + workspace: test.MockWorkspaceWithPreferredNodes, expectedError: nil, + expectedNodes: []string{"node1", "node-p1"}, }, } for k, tc := range testcases { t.Run(k, func(t *testing.T) { mockClient := test.NewClient() - mockWorkspace := test.MockWorkspaceDistributedModel reconciler := &WorkspaceReconciler{ Client: mockClient, Scheme: test.NewTestScheme(), @@ -802,15 +882,17 @@ func TestGetAllQualifiedNodes(t *testing.T) { tc.callMocks(mockClient) - nodes, err := reconciler.getAllQualifiedNodes(ctx, mockWorkspace) - if tc.expectedError == nil { - assert.Check(t, err == nil, "Not expected to return error") - assert.Check(t, nodes != nil, "Response node array should not be nil") - assert.Check(t, len(nodes) == 1, "One out of three nodes should be qualified") - } else { + nodes, err := reconciler.getAllQualifiedNodes(ctx, tc.workspace) + + if tc.expectedError != nil { assert.Equal(t, tc.expectedError.Error(), err.Error()) assert.Check(t, nodes == nil, "Response node array should be nil") + return } + + assert.Check(t, err == nil, "Not expected to return error") + assert.Check(t, nodes != nil, "Response node array should not be nil") + assert.Check(t, len(nodes) == len(tc.expectedNodes), "Unexpected qualified nodes") }) } } diff --git a/pkg/utils/test/testUtils.go b/pkg/utils/test/testUtils.go index d952852d9..a3eee57b4 100644 --- a/pkg/utils/test/testUtils.go +++ b/pkg/utils/test/testUtils.go @@ -47,6 +47,29 @@ var ( }, }, } + MockWorkspaceWithPreferredNodes = &v1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testWorkspace", + Namespace: "kaito", + }, + Resource: v1alpha1.ResourceSpec{ + Count: &gpuNodeCount, + InstanceType: "Standard_NC12s_v3", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "apps": "test", + }, + }, + PreferredNodes: []string{"node-p1", "node-p2"}, + }, + Inference: &v1alpha1.InferenceSpec{ + Preset: &v1alpha1.PresetSpec{ + PresetMeta: v1alpha1.PresetMeta{ + Name: "test-distributed-model", + }, + }, + }, + } ) var ( diff --git a/presets/models/phi2/README.md b/presets/models/phi2/README.md index 8ecaaa4b1..cfa183df9 100644 --- a/presets/models/phi2/README.md +++ b/presets/models/phi2/README.md @@ -1,7 +1,7 @@ ## Supported Models |Model name| Model source | Sample workspace|Kubernetes Workload|Distributed inference| |----|:----:|:----:| :----: |:----: | -|phi-2 |[microsoft](https://huggingface.co/microsoft/phi-2)|[link](../../../examples/inference/kaito_workspace_phi_3.yaml)|Deployment| false| +|phi-2 |[microsoft](https://huggingface.co/microsoft/phi-2)|[link](../../../examples/inference/kaito_workspace_phi_2.yaml)|Deployment| false| ## Image Source diff --git a/presets/models/phi3/README.md b/presets/models/phi3/README.md index ce75b9eeb..dc98ae2c4 100644 --- a/presets/models/phi3/README.md +++ b/presets/models/phi3/README.md @@ -1,10 +1,10 @@ ## Supported Models | Model name | Model source | Sample workspace|Kubernetes Workload|Distributed inference| |--------------------------|:-----------------------------------------------------------------------:|:----:| :----: |:----: | -| phi-3-mini-4k-instruct | [microsoft](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |[link](../../../examples/inference/kaito_workspace_phi_3.yaml)|Deployment| false| -| phi-3-mini-128k-instruct | [microsoft](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) |[link](../../../examples/inference/kaito_workspace_phi_3.yaml)|Deployment| false| -| phi-3-mini-4k-instruct | [microsoft](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct) |[link](../../../examples/inference/kaito_workspace_phi_3.yaml)|Deployment| false| -| phi-3-mini-128k-instruct | [microsoft](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) |[link](../../../examples/inference/kaito_workspace_phi_3.yaml)|Deployment| false| +| phi-3-mini-4k-instruct | [microsoft](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |[link](../../../examples/inference/kaito_workspace_phi_3_mini_4k.yaml)|Deployment| false| +| phi-3-mini-128k-instruct | [microsoft](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) |[link](../../../examples/inference/kaito_workspace_phi_3_mini_128k.yaml)|Deployment| false| +| phi-3-medium-4k-instruct | [microsoft](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct) |[link](../../../examples/inference/kaito_workspace_phi_3_medium_4k.yaml)|Deployment| false| +| phi-3-medium-128k-instruct | [microsoft](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) |[link](../../../examples/inference/kaito_workspace_phi_3_medium_128k.yaml)|Deployment| false| ## Image Source - **Public**: Kaito maintainers manage the lifecycle of the inference service images that contain model weights. The images are available in Microsoft Container Registry (MCR).