Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release: Update manifest and helm charts for v0.3.2 #695

Merged
merged 4 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ jobs:
go-version: ${{ env.GO_VERSION }}

- name: Az login
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1
uses: azure/login@v2.2.0
with:
client-id: ${{ secrets.E2E_CLIENT_ID }}
tenant-id: ${{ secrets.E2E_TENANT_ID }}
Expand Down
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# Image URL to use all building/pushing image targets
REGISTRY ?= YOUR_REGISTRY
IMG_NAME ?= workspace
VERSION ?= v0.3.1
GPU_PROVISIONER_VERSION ?= 0.2.0
VERSION ?= v0.3.2
GPU_PROVISIONER_VERSION ?= 0.2.1
IMG_TAG ?= $(subst v,,$(VERSION))

ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
Expand Down Expand Up @@ -262,6 +262,7 @@ gpu-provisioner-helm: ## Update Azure client env vars and settings in helm valu
helm install $(GPU_PROVISIONER_NAMESPACE) \
--values gpu-provisioner-values.yaml \
--set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) \
--namespace $(GPU_PROVISIONER_NAMESPACE) --create-namespace \
https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$(GPU_PROVISIONER_VERSION).tgz

kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s
Expand Down
3 changes: 1 addition & 2 deletions api/v1alpha1/workspace_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ type ResourceSpec struct {
LabelSelector *metav1.LabelSelector `json:"labelSelector"`

// PreferredNodes is an optional node list specified by the user.
// If a node in the list does not have the required labels or
// the required instanceType, it will be ignored.
// If a node in the list does not have the required labels, it will be ignored.
// +optional
PreferredNodes []string `json:"preferredNodes,omitempty"`
}
Expand Down
4 changes: 2 additions & 2 deletions charts/kaito/workspace/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.3.1
version: 0.3.2

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: 0.3.1
appVersion: 0.3.2
home: https://github.com/Azure/kaito
sources:
- https://github.com/Azure/kaito
Expand Down
2 changes: 1 addition & 1 deletion charts/kaito/workspace/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
```bash
export REGISTRY=mcr.microsoft.com/aks/kaito
export IMG_NAME=workspace
export IMG_TAG=0.3.1
export IMG_TAG=0.3.2
helm install workspace ./charts/kaito/workspace \
--set image.repository=${REGISTRY}/$(IMG_NAME) --set image.tag=$(IMG_TAG) \
--namespace kaito-workspace --create-namespace
Expand Down
2 changes: 1 addition & 1 deletion charts/kaito/workspace/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ replicaCount: 1
image:
repository: mcr.microsoft.com/aks/kaito/workspace
pullPolicy: IfNotPresent
tag: 0.3.1
tag: 0.3.2
imagePullSecrets: []
podAnnotations: {}
podSecurityContext:
Expand Down
2 changes: 1 addition & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ az role assignment create --assignee $IDENTITY_PRINCIPAL_ID --scope /subscriptio
Install the Node provisioner controller.
```bash
# get additional values for helm chart install
export GPU_PROVISIONER_VERSION=0.2.0
export GPU_PROVISIONER_VERSION=0.2.1

curl -sO https://raw.githubusercontent.com/Azure/gpu-provisioner/main/hack/deploy/configure-helm-values.sh
chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $MY_CLUSTER $RESOURCE_GROUP $IDENTITY_NAME
Expand Down
12 changes: 12 additions & 0 deletions examples/inference/kaito_workspace_phi_3_medium_128k.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: kaito.sh/v1alpha1
kind: Workspace
metadata:
name: workspace-phi-3-medium
resource:
instanceType: "Standard_NC24ads_A100_v4"
labelSelector:
matchLabels:
apps: phi-3
inference:
preset:
name: phi-3-medium-128k-instruct
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,3 @@ resource:
inference:
preset:
name: phi-3-medium-4k-instruct
# Note: This configuration also works with the phi-3-medium-128k-instruct preset
12 changes: 12 additions & 0 deletions examples/inference/kaito_workspace_phi_3_mini_128k.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: kaito.sh/v1alpha1
kind: Workspace
metadata:
name: workspace-phi-3-mini
resource:
instanceType: "Standard_NC6s_v3"
labelSelector:
matchLabels:
apps: phi-3
inference:
preset:
name: phi-3-mini-128k-instruct
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,3 @@ resource:
inference:
preset:
name: phi-3-mini-4k-instruct
# Note: This configuration also works with the phi-3-mini-128k-instruct preset
33 changes: 18 additions & 15 deletions pkg/controllers/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/record"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
Expand Down Expand Up @@ -385,7 +386,7 @@ func (c *WorkspaceReconciler) applyWorkspaceResource(ctx context.Context, wObj *
}
}

// Find all nodes that match the labelSelector and instanceType, they are not necessarily created by machines/nodeClaims.
// Find all nodes that meet the requirements, they are not necessarily created by machines/nodeClaims.
validNodes, err := c.getAllQualifiedNodes(ctx, wObj)
if err != nil {
return err
Expand Down Expand Up @@ -474,7 +475,6 @@ func (c *WorkspaceReconciler) applyWorkspaceResource(ctx context.Context, wObj *
return nil
}

// getAllQualifiedNodes returns all nodes that match the labelSelector and instanceType.
func (c *WorkspaceReconciler) getAllQualifiedNodes(ctx context.Context, wObj *kaitov1alpha1.Workspace) ([]*corev1.Node, error) {
var qualifiedNodes []*corev1.Node

Expand All @@ -488,33 +488,36 @@ func (c *WorkspaceReconciler) getAllQualifiedNodes(ctx context.Context, wObj *ka
return nil, nil
}

preferredNodeSet := sets.New(wObj.Resource.PreferredNodes...)

for index := range nodeList.Items {
nodeObj := nodeList.Items[index]
// Skip nodes that are being deleted
// skip nodes that are being deleted
if nodeObj.DeletionTimestamp != nil {
continue
}
foundInstanceType := c.validateNodeInstanceType(ctx, wObj, lo.ToPtr(nodeObj))

// skip nodes that are not ready
_, statusRunning := lo.Find(nodeObj.Status.Conditions, func(condition corev1.NodeCondition) bool {
return condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue
})
if !statusRunning {
continue
}

if foundInstanceType && statusRunning {
// match the preferred node
if preferredNodeSet.Has(nodeObj.Name) {
qualifiedNodes = append(qualifiedNodes, lo.ToPtr(nodeObj))
continue
}
}

return qualifiedNodes, nil
}

// check if node has the required instanceType
func (c *WorkspaceReconciler) validateNodeInstanceType(ctx context.Context, wObj *kaitov1alpha1.Workspace, nodeObj *corev1.Node) bool {
if instanceTypeLabel, found := nodeObj.Labels[corev1.LabelInstanceTypeStable]; found {
if instanceTypeLabel != wObj.Resource.InstanceType {
return false
// match the instanceType
if nodeObj.Labels[corev1.LabelInstanceTypeStable] == wObj.Resource.InstanceType {
qualifiedNodes = append(qualifiedNodes, lo.ToPtr(nodeObj))
}
}
return true

return qualifiedNodes, nil
}

// createAndValidateNode creates a new node and validates status.
Expand Down
110 changes: 96 additions & 14 deletions pkg/controllers/workspace_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -751,29 +751,108 @@ func TestApplyInferenceWithTemplate(t *testing.T) {
}

func TestGetAllQualifiedNodes(t *testing.T) {
deletedNode := corev1.Node{
ObjectMeta: v1.ObjectMeta{
Name: "node4",
Labels: map[string]string{
corev1.LabelInstanceTypeStable: "Standard_NC12s_v3",
},
DeletionTimestamp: &v1.Time{Time: time.Now()},
},
}

testcases := map[string]struct {
callMocks func(c *test.MockClient)
workspace *v1alpha1.Workspace
expectedError error
expectedNodes []string
}{
"Fails to get qualified nodes because can't list nodes": {
callMocks: func(c *test.MockClient) {
c.On("List", mock.IsType(context.Background()), mock.IsType(&corev1.NodeList{}), mock.Anything).Return(errors.New("Failed to list nodes"))
},
workspace: test.MockWorkspaceDistributedModel,
expectedError: errors.New("Failed to list nodes"),
expectedNodes: nil,
},
"Gets all qualified nodes": {
callMocks: func(c *test.MockClient) {
nodeList := test.MockNodeList
deletedNode := corev1.Node{
ObjectMeta: v1.ObjectMeta{
Name: "node4",
Labels: map[string]string{
corev1.LabelInstanceTypeStable: "Standard_NC12s_v3",

nodeList.Items = append(nodeList.Items, deletedNode)

relevantMap := c.CreateMapWithType(nodeList)
//insert node objects into the map
for _, obj := range test.MockNodeList.Items {
n := obj
objKey := client.ObjectKeyFromObject(&n)

relevantMap[objKey] = &n
}

c.On("List", mock.IsType(context.Background()), mock.IsType(&corev1.NodeList{}), mock.Anything).Return(nil)
},
workspace: test.MockWorkspaceDistributedModel,
expectedError: nil,
expectedNodes: []string{"node1"},
},
"Gets all qualified nodes with preferred": {
callMocks: func(c *test.MockClient) {
nodeList := test.MockNodeList

nodeList.Items = append(nodeList.Items, deletedNode)

nodesFromOtherVendor := []corev1.Node{
{
ObjectMeta: v1.ObjectMeta{
Name: "node-p1",
Labels: map[string]string{
corev1.LabelInstanceTypeStable: "vendor1",
},
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
{
ObjectMeta: v1.ObjectMeta{
Name: "node-p2",
Labels: map[string]string{
corev1.LabelInstanceTypeStable: "vendor2",
},
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionFalse,
},
},
},
},
{
ObjectMeta: v1.ObjectMeta{
Name: "node-p3",
Labels: map[string]string{
corev1.LabelInstanceTypeStable: "vendor1",
},
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
DeletionTimestamp: &v1.Time{Time: time.Now()},
},
}
nodeList.Items = append(nodeList.Items, deletedNode)
nodeList.Items = append(nodeList.Items, nodesFromOtherVendor...)

relevantMap := c.CreateMapWithType(nodeList)
//insert node objects into the map
Expand All @@ -786,14 +865,15 @@ func TestGetAllQualifiedNodes(t *testing.T) {

c.On("List", mock.IsType(context.Background()), mock.IsType(&corev1.NodeList{}), mock.Anything).Return(nil)
},
workspace: test.MockWorkspaceWithPreferredNodes,
expectedError: nil,
expectedNodes: []string{"node1", "node-p1"},
},
}

for k, tc := range testcases {
t.Run(k, func(t *testing.T) {
mockClient := test.NewClient()
mockWorkspace := test.MockWorkspaceDistributedModel
reconciler := &WorkspaceReconciler{
Client: mockClient,
Scheme: test.NewTestScheme(),
Expand All @@ -802,15 +882,17 @@ func TestGetAllQualifiedNodes(t *testing.T) {

tc.callMocks(mockClient)

nodes, err := reconciler.getAllQualifiedNodes(ctx, mockWorkspace)
if tc.expectedError == nil {
assert.Check(t, err == nil, "Not expected to return error")
assert.Check(t, nodes != nil, "Response node array should not be nil")
assert.Check(t, len(nodes) == 1, "One out of three nodes should be qualified")
} else {
nodes, err := reconciler.getAllQualifiedNodes(ctx, tc.workspace)

if tc.expectedError != nil {
assert.Equal(t, tc.expectedError.Error(), err.Error())
assert.Check(t, nodes == nil, "Response node array should be nil")
return
}

assert.Check(t, err == nil, "Not expected to return error")
assert.Check(t, nodes != nil, "Response node array should not be nil")
assert.Check(t, len(nodes) == len(tc.expectedNodes), "Unexpected qualified nodes")
})
}
}
Expand Down
23 changes: 23 additions & 0 deletions pkg/utils/test/testUtils.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,29 @@ var (
},
},
}
MockWorkspaceWithPreferredNodes = &v1alpha1.Workspace{
ObjectMeta: metav1.ObjectMeta{
Name: "testWorkspace",
Namespace: "kaito",
},
Resource: v1alpha1.ResourceSpec{
Count: &gpuNodeCount,
InstanceType: "Standard_NC12s_v3",
LabelSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"apps": "test",
},
},
PreferredNodes: []string{"node-p1", "node-p2"},
},
Inference: &v1alpha1.InferenceSpec{
Preset: &v1alpha1.PresetSpec{
PresetMeta: v1alpha1.PresetMeta{
Name: "test-distributed-model",
},
},
},
}
)

var (
Expand Down
2 changes: 1 addition & 1 deletion presets/models/phi2/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
## Supported Models
|Model name| Model source | Sample workspace|Kubernetes Workload|Distributed inference|
|----|:----:|:----:| :----: |:----: |
|phi-2 |[microsoft](https://huggingface.co/microsoft/phi-2)|[link](../../../examples/inference/kaito_workspace_phi_3.yaml)|Deployment| false|
|phi-2 |[microsoft](https://huggingface.co/microsoft/phi-2)|[link](../../../examples/inference/kaito_workspace_phi_2.yaml)|Deployment| false|


## Image Source
Expand Down
Loading