diff --git a/.gitignore b/.gitignore index 66fd13c9..4c01ff68 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,7 @@ # Dependency directories (remove the comment below to include it) # vendor/ + +# Ignore macOS and IDE specific files +.DS_Store +.idea/ diff --git a/resource-usage-collect-addon/.gitignore b/resource-usage-collect-addon/.gitignore index 4663271c..c434a82b 100644 --- a/resource-usage-collect-addon/.gitignore +++ b/resource-usage-collect-addon/.gitignore @@ -14,3 +14,7 @@ bin/ # Dependency directories (remove the comment below to include it) vendor/ + +# Ignore macOS and IDE specific files +.DS_Store +.idea/ diff --git a/resource-usage-collect-addon/Dockerfile b/resource-usage-collect-addon/Dockerfile index 47a86ba8..4fe763d9 100644 --- a/resource-usage-collect-addon/Dockerfile +++ b/resource-usage-collect-addon/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.21 AS builder +FROM golang:1.22 AS builder WORKDIR /go/src/open-cluster-management.io/addon-contrib/resource-usage-collect COPY . . ENV GO_PACKAGE open-cluster-management.io/addon-contrib/resource-usage-collect diff --git a/resource-usage-collect-addon/Makefile b/resource-usage-collect-addon/Makefile index 771729cf..be91020b 100644 --- a/resource-usage-collect-addon/Makefile +++ b/resource-usage-collect-addon/Makefile @@ -20,7 +20,7 @@ PWD=$(shell pwd) # Image URL to use all building/pushing image targets; GO_BUILD_PACKAGES :=./pkg/... IMAGE ?= resource-usage-collect-addon -IMAGE_REGISTRY ?= quay.io/haoqing +IMAGE_REGISTRY ?= zheshen IMAGE_TAG ?= latest IMAGE_NAME ?= $(IMAGE_REGISTRY)/$(IMAGE):$(IMAGE_TAG) @@ -43,7 +43,7 @@ vet: ## Run go vet against code. ##@ Build .PHONY: build build: fmt vet ## Build manager binary. - GOFLAGS="" go build -o addon ./pkg/addon/main.go ./pkg/addon/controller.go + GOFLAGS="" go build -o addon ./pkg/main.go .PHONY: images images: ## Build addon binary. diff --git a/resource-usage-collect-addon/README.md b/resource-usage-collect-addon/README.md index 106f735d..af934ecb 100644 --- a/resource-usage-collect-addon/README.md +++ b/resource-usage-collect-addon/README.md @@ -1,59 +1,86 @@ -# Prototype of extensible scheduling using resources usage. -We already support [extensible placement scheduling](https://github.com/open-cluster-management-io/enhancements/blob/main/enhancements/sig-architecture/32-extensiblescheduling/32-extensiblescheduling.md), which allows use of [addonplacementscore](https://github.com/open-cluster-management-io/enhancements/blob/main/enhancements/sig-architecture/32-extensiblescheduling/32-extensiblescheduling.md#addonplacementscore-api) to select clusters, but we lack an addonplacementscore that contains cluster resource usage information. +# Resource usage collect addon -In this repo, I developed an addon through addon-freamwork, this addon is mainly used to collect resource usage information on the cluster, and generate an addonplacementscore under the cluster namespace of the hub. +## Background -More details refer to [Extend the multicluster scheduling capabilities with placement](https://open-cluster-management.io/scenarios/extend-multicluster-scheduling-capabilities/) +With the rapid advancement of artificial intelligence, an increasing number of developers are required to schedule and plan AI/ML workloads based on available resources to achieve optimal performance and resource efficiency. 
+
+
+Open-Cluster-Management (OCM) has already implemented `Placement` and supports [extensible placement scheduling](https://github.com/open-cluster-management-io/enhancements/blob/main/enhancements/sig-architecture/32-extensiblescheduling/32-extensiblescheduling.md), which allows for advanced, customizable workload scheduling across clusters. The key components are:
+
+- `Placement`: This feature enables the dynamic selection of a set of `ManagedClusters` within one or more `ManagedClusterSets` to facilitate Multi-Cluster scheduling.
+- `AddOnPlacementScore`: An API introduced by `Placement` to support scheduling based on customized scores.
+
+The `resource-usage-addon` is developed with `AddonTemplate` and operates within this framework.
+- Once installed on the hub cluster, the addon deploys an agent on each managed cluster.
+- Agent pods on the managed clusters collect resource usage data and calculate a corresponding score.
+- These scores are then used by `Placement` to inform cluster selection, ensuring workloads are deployed on clusters with the most appropriate available resources.
+
+This repository, developed as part of [Google Summer of Code 2024](https://github.com/open-cluster-management-io/ocm/issues/369), introduces enhancements to the `resource-usage-addon`, including new support for scheduling based on GPU and TPU resource availability.
+This update is particularly valuable for developers seeking to optimize AI/ML workloads across multiple clusters.
+
+
+REF:
+- [GSoC 2024: Scheduling AI workload among multiple clusters #369](https://github.com/open-cluster-management-io/ocm/issues/369)
+- [Extend the multicluster scheduling capabilities with placement](https://open-cluster-management.io/scenarios/extend-multicluster-scheduling-capabilities/)
+- [What-is-an-addon](https://open-cluster-management.io/concepts/addon/#what-is-an-add-on)
+- [What-is-a-placement](https://open-cluster-management.io/concepts/placement/#select-clusters-in-managedclusterset)
+- [Enhancement:addontemplate](https://github.com/open-cluster-management-io/enhancements/tree/main/enhancements/sig-architecture/82-addon-template)

# Quickstart

-## Prepare
-You have at least two running kubernetes cluster. One is the hub cluster, the other is managedcluster.
+## Prerequisite
+1. Follow the instructions on the [OCM official website](https://open-cluster-management.io/getting-started/quick-start/) to install the `clusteradm` command-line tool and set up a hub (manager) cluster with two managed clusters.
+   If you prefer using a different Kubernetes distribution, follow the instructions in [Set-hub-and-managed-cluster](https://open-cluster-management.io/getting-started/quick-start/#setup-hub-and-managed-cluster).

-You can create an ocm environment by running below command, which will create a hub and two managedclusters for you.
+2. Command-line tool `kubectl` installed.

-```bash
-curl -sSL https://raw.githubusercontent.com/open-cluster-management-io/OCM/main/solutions/setup-dev-environment/local-up.sh | bash
-```
+3. [Docker](https://www.docker.com/) installed.

## Deploy

-Set environment variables.
+**Export the `kubeconfig` file of your hub cluster.**
```bash
export KUBECONFIG= # export KUBECONFIG=~/.kube/config
```

-Build the docker image to run the sample AddOn.
+**Build the docker image to run the resource-usage-addon.**
```bash
# build image
-export IMAGE_NAME=quay.io/haoqing/resource-usage-collect-addon:latest
+export IMAGE_NAME=zheshen/resource-usage-collect-addon-template:latest
make images
```

-If your are using kind, load image into kind cluster.
+**If you are using kind, load the image into your hub cluster.**
```bash
kind load docker-image $IMAGE_NAME --name cluster_name # kind load docker-image $IMAGE_NAME --name hub
```

-And then deploy the example AddOns controller on hub cluster.
+**On the hub cluster, deploy the addon.**
```bash
make deploy
```

-On the hub cluster, verify the resource-usage-collect-controller pod is running.
-```bash
-$ kubectl get pods -n open-cluster-management | grep resource-usage-collect-controller
-resource-usage-collect-controller-55c58bbc5-t45dh   1/1     Running   0          71s
-```
+## What's Next

-## What is next
+If deployed successfully:

-After the deployment is complete, addon will create an addonplacementscore in its own namespace for each managedcluster in the hub.
+On the hub cluster, you can see the `AddonTemplate` and check the `ManagedClusterAddon` status.
+```bash
+$ kubectl get addontemplate
+NAME                     ADDON NAME
+resource-usage-collect   resource-usage-collect
+
+$ kubectl get mca -A
+NAMESPACE   NAME                     AVAILABLE   DEGRADED   PROGRESSING
+cluster1    resource-usage-collect   True                   False
+cluster2    resource-usage-collect   True                   False
+```
+After a short while, on the hub cluster, an `AddonPlacementScore` for each managed cluster will be generated.
```bash
$ kubectl config use kind-hub
$ kubectl get addonplacementscore -A
@@ -61,22 +88,38 @@ NAMESPACE   NAME                   AGE
cluster1    resource-usage-score   3m23s
cluster2    resource-usage-score   3m24s
```
+### Resource Scoring Strategies
+
+#### Node Scope Score
+- Node Scope Score: Indicates the available resources on the node with the most capacity in the cluster, aiding in selecting the best node for resource-intensive workloads.
+- Code Representation: Represented as `cpuNodeAvailable`, `gpuNodeAvailable`, etc., indicating available CPU and GPU resources on specific nodes.
+
+#### Example Use Scenarios:
+- Scenario: Suppose you have a cluster with three nodes: Node A with 2 available GPUs, Node B with 4 available GPUs, and Node C with 6 available GPUs. You need to deploy a job that requires 1 GPU.
+- Scheduling Strategies: Using the Node Scope Score, specifically `gpuNodeAvailable`, the scheduler can identify Node A as the optimal node by preferring a lower `gpuNodeAvailable` value for this job under a bin-packing strategy. The scheduler would prefer to place the job on Node A to keep Nodes B and C more available for future jobs that may require more resources. This approach minimizes fragmentation and ensures that larger jobs can be accommodated later.

-### For example
+#### Cluster Scope Score
+- Cluster Scope Score: Reflects the total available resources across the entire cluster, helping to determine whether the cluster can support additional workloads.
+- Code Representation: Represented as `cpuClusterAvailable`, `gpuClusterAvailable`, etc., aggregating available resources across all nodes in the cluster.

-Select a cluster with more available CPU.
+#### Example Use Scenarios:
+- Scenario: Consider a multi-cluster environment where Cluster X has 10 available GPUs across all nodes, Cluster Y has 6 available GPUs, and Cluster Z has 8 available GPUs. You need to deploy two jobs: the first requires 3 GPUs, and the other requires 4 GPUs.
+- Scheduling Strategies: Using the Cluster Scope Score, specifically `gpuClusterAvailable`, the scheduler would place the first job on Cluster X because it has the most available GPU resources. Cluster X's score then becomes lower, so the scheduler will deploy the second job on Cluster Z. This ensures that workloads are spread out, maximizing resource utilization across clusters and avoiding overloading a single cluster.
+
+### Use Placement to select clusters
+Consider this example use case: As a developer, I want to select a cluster with the most available GPU resources and deploy a job on it.

Bind the default ManagedClusterSet to default Namespace.
```bash
clusteradm clusterset bind default --namespace default
```
-
+A user could create a placement to select the one cluster that has the most available GPU resources.
```bash
cat < MAXCPUCOUNT {
-		cpuScore = int64(MAXSCORE)
-	} else if availableCpu <= MINCPUCOUNT {
-		cpuScore = int64(MINSCORE)
-	} else {
-		cpuScore = int64(200*availableCpu/MAXCPUCOUNT - 100)
+// Calculate the available resources in the cluster scope and return four scores for CPU, Memory, GPU, and TPU.
+func (s *Score) calculateClusterScopeScore() (cpuScore int32, memScore int32, gpuScore int32, tpuScore int32, err error) {
+	// Get the total available CPU resources across the cluster.
+	cpuAvailable, err := s.calculateClusterAvailable(string(clusterv1.ResourceCPU))
+	if err != nil {
+		return 0, 0, 0, 0, err
+	}
+
+	// Get the total available Memory resources across the cluster.
+	memAvailable, err := s.calculateClusterAvailable(string(clusterv1.ResourceMemory))
+	if err != nil {
+		return 0, 0, 0, 0, err
	}
-	availableMem := (memAlloc - memUsage) / (1024 * 1024) // MB
-	if availableMem > MAXMEMCOUNT {
-		memScore = int64(MAXSCORE)
-	} else if availableMem <= MINMEMCOUNT {
-		memScore = int64(MINSCORE)
-	} else {
-		memScore = int64(200*availableMem/MAXMEMCOUNT - 100)
+	// Get the total available GPU resources across the cluster.
+	gpuAvailable, err := s.calculateClusterAvailable(ResourceGPU)
+	if err != nil {
+		return 0, 0, 0, 0, err
+	}
+
+	// Get the total available TPU resources across the cluster.
+	tpuAvailable, err := s.calculateClusterAvailable(ResourceTPU)
+	if err != nil {
+		return 0, 0, 0, 0, err
	}
-	klog.Infof("cpuScore = %v, memScore = %v", cpuScore, memScore)
-	return cpuScore, memScore, nil
+	// Normalize and return the scores based on available resources
+	return s.normalizeScore("cluster", cpuAvailable, memAvailable, gpuAvailable, tpuAvailable)
}
-func (s *Score) calculateClusterAllocateable(resourceName clusterv1.ResourceName) (float64, error) {
+// Calculate the available resources in the cluster scope.
+func (s *Score) calculateClusterAvailable(resourceName string) (float64, error) {
	nodes, err := s.nodeLister.List(labels.Everything())
	if err != nil {
		return 0, err
	}
-	allocatableList := make(map[clusterv1.ResourceName]resource.Quantity)
+	var totalAllocatable float64
+	var totalUsage float64
+
	for _, node := range nodes {
		if node.Spec.Unschedulable {
			continue
		}
-		for key, value := range node.Status.Allocatable {
-			if allocatable, exist := allocatableList[clusterv1.ResourceName(key)]; exist {
-				allocatable.Add(value)
-				allocatableList[clusterv1.ResourceName(key)] = allocatable
-			} else {
-				allocatableList[clusterv1.ResourceName(key)] = value
-			}
+
+		// Accumulate allocatable resources from all nodes
+		alloc, exists := node.Status.Allocatable[v1.ResourceName(resourceName)]
+		if exists {
+			totalAllocatable += alloc.AsApproximateFloat64()
+		}
+
+		// Calculate the resource usage for this node
+		usage, err := s.calculateNodeResourceUsage(node.Name, resourceName)
+		if err != nil {
+			return 0, err
+		}
+		totalUsage += usage
+	}
+
+	// Calculate available resources
+	available := totalAllocatable - totalUsage
+	return available, nil
+}
+
+// Normalize the scores with the ScoreNormalizer.
+func (s *Score) normalizeScore(scope string, cpuAvailable, memAvailable, gpuAvailable, tpuAvailable float64) (cpuScore int32, memScore int32, gpuScore int32, tpuScore int32, err error) {
+	// The scope parameter identifies whether the scores are calculated for the cluster scope or the node scope (used for logging).
+	klog.Infof("[%s] cpuAvailable = %v, memAvailable = %v, gpuAvailable = %v, tpuAvailable = %v", scope, cpuAvailable, memAvailable, gpuAvailable, tpuAvailable)
+
+	cpuScoreNormalizer := clustersdkv1alpha1.NewScoreNormalizer(MINCPUCOUNT, MAXCPUCOUNT)
+	cpuScore, err = cpuScoreNormalizer.Normalize(cpuAvailable)
+	if err != nil {
+		return 0, 0, 0, 0, err
+	}
+
+	availableMem := memAvailable / (1024 * 1024) // convert bytes to MB
+	memScoreNormalizer := clustersdkv1alpha1.NewScoreNormalizer(MINMEMCOUNT, MAXMEMCOUNT)
+	memScore, err = memScoreNormalizer.Normalize(availableMem)
+	if err != nil {
+		return 0, 0, 0, 0, err
+	}
+
+	gpuScoreNormalizer := clustersdkv1alpha1.NewScoreNormalizer(MINGPUCOUNT, MAXGPUCOUNT)
+	gpuScore, err = gpuScoreNormalizer.Normalize(gpuAvailable)
+	if err != nil {
+		return 0, 0, 0, 0, err
+	}
+
+	tpuScoreNormalizer := clustersdkv1alpha1.NewScoreNormalizer(MINTPUCOUNT, MAXTPUCOUNT)
+	tpuScore, err = tpuScoreNormalizer.Normalize(tpuAvailable)
+	if err != nil {
+		return 0, 0, 0, 0, err
+	}
+
+	klog.Infof("[%s] cpuScore = %v, memScore = %v, gpuScore = %v, tpuScore = %v", scope, cpuScore, memScore, gpuScore, tpuScore)
+	return cpuScore, memScore, gpuScore, tpuScore, nil
+}
+
+// Find the node in the cluster that has the maximum available resources.
+func (s *Score) calculateMaxAvailableNode(resourceName string) (float64, string, error) {
+	// Get the list of all nodes.
+	nodes, err := s.nodeLister.List(labels.Everything())
+	if err != nil {
+		return 0, "", err
+	}
+	var maxAvailable float64
+	var maxNodeName string
+	// Iterate over every node and calculate its amount of available resources.
+	for _, node := range nodes {
+		if node.Spec.Unschedulable {
+			continue
+		}
+		alloc, exists := node.Status.Allocatable[v1.ResourceName(resourceName)]
+		if !exists {
+			continue
+		}
+		// Get the resource usage on this node.
+		usage, err := s.calculateNodeResourceUsage(node.Name, resourceName)
+		if err != nil {
+			return 0, "", err
+		}
+		// Calculate the actual amount of resources available.
+ available := alloc.AsApproximateFloat64() - usage + // Find the node with the maximum available resources. + if available > maxAvailable { + maxAvailable = available + maxNodeName = node.Name } } - quantity := allocatableList[resourceName] - return quantity.AsApproximateFloat64(), nil + klog.Infof("Max available %s: %f on node: %s", resourceName, maxAvailable, maxNodeName) + return maxAvailable, maxNodeName, nil } -func (s *Score) calculatePodResourceRequest(resourceName v1.ResourceName) (float64, error) { - list, err := s.podListener.List(labels.Everything()) +// Calculate the actual usage of a specific resource (e.g., GPU) by unfinished Pods on a given node. +func (s *Score) calculateNodeResourceUsage(nodeName string, resourceName string) (float64, error) { + // Get the list of all Pods. + list, err := s.podLister.List(labels.Everything()) if err != nil { return 0, err } var podRequest float64 - var podCount int for _, pod := range list { + // Only counts Pods dispatched to specific nodes. + if pod.Spec.NodeName != nodeName { + continue + } + + // Skip completed Pods or Pods that have released resources. + if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed || pod.DeletionTimestamp != nil { + continue + } + // Calculate resource requests for each container in the Pod. for i := range pod.Spec.Containers { container := &pod.Spec.Containers[i] value := s.getRequestForResource(resourceName, &container.Resources.Requests, !s.useRequested) podRequest += value } + // Calculate resource requests for the Init container. for i := range pod.Spec.InitContainers { initContainer := &pod.Spec.InitContainers[i] value := s.getRequestForResource(resourceName, &initContainer.Resources.Requests, !s.useRequested) @@ -132,34 +243,33 @@ func (s *Score) calculatePodResourceRequest(resourceName v1.ResourceName) (float // If Overhead is being utilized, add to the total requests for the pod if pod.Spec.Overhead != nil && s.enablePodOverhead { - if quantity, found := pod.Spec.Overhead[resourceName]; found { + if quantity, found := pod.Spec.Overhead[v1.ResourceName(resourceName)]; found { podRequest += quantity.AsApproximateFloat64() } } - podCount++ } return podRequest, nil } -func (s *Score) getRequestForResource(resource v1.ResourceName, requests *v1.ResourceList, nonZero bool) float64 { +func (s *Score) getRequestForResource(resource string, requests *v1.ResourceList, nonZero bool) float64 { if requests == nil { return 0 } switch resource { - case v1.ResourceCPU: + case string(v1.ResourceCPU): // Override if un-set, but not if explicitly set to zero if _, found := (*requests)[v1.ResourceCPU]; !found && nonZero { return 100 } return requests.Cpu().AsApproximateFloat64() - case v1.ResourceMemory: + case string(v1.ResourceMemory): // Override if un-set, but not if explicitly set to zero if _, found := (*requests)[v1.ResourceMemory]; !found && nonZero { return 200 * 1024 * 1024 } return requests.Memory().AsApproximateFloat64() default: - quantity, found := (*requests)[resource] + quantity, found := (*requests)[v1.ResourceName(resource)] if !found { return 0 } diff --git a/resource-usage-collect-addon/pkg/addon/agent/calculate_test.go b/resource-usage-collect-addon/pkg/addon/agent/calculate_test.go index d4737e9b..065d4257 100644 --- a/resource-usage-collect-addon/pkg/addon/agent/calculate_test.go +++ b/resource-usage-collect-addon/pkg/addon/agent/calculate_test.go @@ -12,70 +12,113 @@ import ( "k8s.io/client-go/kubernetes/fake" ) +// Test normalizeScore function. 
func TestNormalizeValue(t *testing.T) { cases := []struct { name string - cpuAlloc float64 - cpuUsage float64 - memAlloc float64 - memUsage float64 - expectCPUScore int64 - expectMemScore int64 + cpuAvailable float64 + memAvailable float64 + gpuAvailable float64 + tpuAvailable float64 + expectCPUScore int32 + expectMemScore int32 + expectGPUScore int32 + expectTPUScore int32 }{ { - name: "usage < alloc", - cpuAlloc: 70, - cpuUsage: 30, - memAlloc: 1024 * 1024 * 1024 * 1024, - memUsage: 1024 * 1024 * 1024 * 500, - expectCPUScore: -20, - expectMemScore: 2, - }, - { - name: "usage = alloc", - cpuAlloc: 70, - cpuUsage: 70, - memAlloc: 1024 * 1024 * 1024, - memUsage: 1024 * 1024 * 1024, + name: "usage = alloc", // Indicating that cpuAvailable, gpuAvailable etc. are all 0. + cpuAvailable: 0, + memAvailable: 0, + gpuAvailable: 0, + tpuAvailable: 0, expectCPUScore: -100, expectMemScore: -100, + expectGPUScore: -100, + expectTPUScore: -100, }, { - name: "usage > alloc", - cpuAlloc: 70, - cpuUsage: 80, - memAlloc: 1024 * 1024 * 1024 * 1024, - memUsage: 1024 * 1024 * 1024 * 1025, + name: "usage < alloc", // Indicating that cpuAvailable, gpuAvailable etc. are all positive. + cpuAvailable: 40, + memAvailable: 524 * 1024 * 1024 * 1024, + gpuAvailable: 2, + tpuAvailable: 1, + expectCPUScore: -20, + expectMemScore: 100, + expectGPUScore: -80, + expectTPUScore: -90, + }, + { + name: "usage > alloc", // Indicating that cpuAvailable, gpuAvailable etc. are all negative. + cpuAvailable: -10, + memAvailable: -1024 * 1024 * 1024, + gpuAvailable: -10, + tpuAvailable: -10, expectCPUScore: -100, expectMemScore: -100, + expectGPUScore: -100, + expectTPUScore: -100, }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { score := Score{} - cpuScore, memScore, err := score.normalizeScore(c.cpuAlloc, c.cpuUsage, c.memAlloc, c.memUsage) + cpuScore, memScore, gpuScore, tpuScore, err := score.normalizeScore("testScope", c.cpuAvailable, c.memAvailable, c.gpuAvailable, c.tpuAvailable) require.NoError(t, err) assert.Equal(t, c.expectCPUScore, cpuScore) assert.Equal(t, c.expectMemScore, memScore) + assert.Equal(t, c.expectGPUScore, gpuScore) + assert.Equal(t, c.expectTPUScore, tpuScore) }) } } -func TestCalculatePodResourceRequest(t *testing.T) { +// Test the calculation of resources across the cluster and on specific nodes +func TestCalculateClusterResources(t *testing.T) { + // Create testing nodes and pods. 
+ node1 := &corev1.Node{ + ObjectMeta: v1.ObjectMeta{ + Name: "node1", + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("16"), + corev1.ResourceMemory: resource.MustParse("32Gi"), + corev1.ResourceName(ResourceGPU): resource.MustParse("6"), + }, + }, + } + + node2 := &corev1.Node{ + ObjectMeta: v1.ObjectMeta{ + Name: "node2", + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("32"), + corev1.ResourceMemory: resource.MustParse("64Gi"), + corev1.ResourceName(ResourceGPU): resource.MustParse("8"), + }, + }, + } + testPod := &corev1.Pod{ ObjectMeta: v1.ObjectMeta{ Name: "test", Namespace: "default", + // Mock Pod deployed in node2 + Labels: map[string]string{"name": "test"}, }, Spec: corev1.PodSpec{ + NodeName: "node2", Containers: []corev1.Container{ { Name: "test", Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("1Gi"), + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("8Gi"), + corev1.ResourceName(ResourceGPU): resource.MustParse("2"), }, }, }, @@ -83,23 +126,27 @@ func TestCalculatePodResourceRequest(t *testing.T) { }, } - clientset := fake.NewSimpleClientset() + clientset := fake.NewSimpleClientset(node1, node2, testPod) informerFactory := informers.NewSharedInformerFactory(clientset, 0) podInformer := informerFactory.Core().V1().Pods() nodeInformer := informerFactory.Core().V1().Nodes() podInformer.Informer().GetStore().Add(testPod) + nodeInformer.Informer().GetStore().Add(node1) + nodeInformer.Informer().GetStore().Add(node2) s := NewScore(nodeInformer, podInformer) - cpuRequest, err := s.calculatePodResourceRequest(corev1.ResourceCPU) + // Test calculateClusterAvailable for GPUs + totalGPUAvailable, err := s.calculateClusterAvailable(ResourceGPU) require.NoError(t, err) - cpuExpected := 0.5 - assert.Equal(t, cpuExpected, cpuRequest) + // The cluster should have 12 GPUs available (6 from node1 + 6 from node2 after deducting 2 used by testPod). + assert.Equal(t, float64(12), totalGPUAvailable) - memoryRequest, err := s.calculatePodResourceRequest(corev1.ResourceMemory) + // Test calculateNodeResourceUsage for node2 + gpuUsage, err := s.calculateNodeResourceUsage("node2", ResourceGPU) require.NoError(t, err) - memoryExpected := float64(1073741824) // 1GiB - assert.Equal(t, memoryExpected, memoryRequest) + // Expect testPod on node2 to use 2 GPUs. 
+ assert.Equal(t, float64(2), gpuUsage) } diff --git a/resource-usage-collect-addon/pkg/addon/controller.go b/resource-usage-collect-addon/pkg/addon/controller.go deleted file mode 100644 index a4a9a9f3..00000000 --- a/resource-usage-collect-addon/pkg/addon/controller.go +++ /dev/null @@ -1,126 +0,0 @@ -package main - -import ( - "context" - "embed" - "fmt" - "os" - - addonagent "open-cluster-management.io/addon-contrib/resource-usage-collect-addon/pkg/addon/agent" - - "github.com/openshift/library-go/pkg/assets" - "github.com/openshift/library-go/pkg/operator/events" - "github.com/openshift/library-go/pkg/operator/resource/resourceapply" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/serializer" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "open-cluster-management.io/addon-framework/pkg/addonfactory" - "open-cluster-management.io/addon-framework/pkg/agent" - "open-cluster-management.io/addon-framework/pkg/utils" - addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" - clusterv1 "open-cluster-management.io/api/cluster/v1" -) - -var ( - genericScheme = runtime.NewScheme() - genericCodecs = serializer.NewCodecFactory(genericScheme) - genericCodec = genericCodecs.UniversalDeserializer() -) - -const ( - defaultExampleImage = "quay.io/open-cluster-management/resource-usage-collect-addon:v0.1" - addonName = "resource-usage-collect" -) - -//go:embed manifests -//go:embed manifests/templates -var fs embed.FS - -var agentPermissionFiles = []string{ - // role with RBAC rules to access resources on hub - "manifests/permission/role.yaml", - // rolebinding to bind the above role to a certain user group - "manifests/permission/rolebinding.yaml", -} - -func newRegistrationOption(kubeConfig *rest.Config, recorder events.Recorder, agentName string) *agent.RegistrationOption { - return &agent.RegistrationOption{ - CSRConfigurations: agent.KubeClientSignerConfigurations(addonName, agentName), - CSRApproveCheck: utils.DefaultCSRApprover(agentName), - PermissionConfig: func(cluster *clusterv1.ManagedCluster, addon *addonapiv1alpha1.ManagedClusterAddOn) error { - kubeclient, err := kubernetes.NewForConfig(kubeConfig) - if err != nil { - return err - } - - for _, file := range agentPermissionFiles { - if err := applyManifestFromFile(file, cluster.Name, addon.Name, kubeclient, recorder); err != nil { - return err - } - } - - return nil - }, - } -} - -func applyManifestFromFile(file, clusterName, addonName string, kubeclient *kubernetes.Clientset, recorder events.Recorder) error { - groups := agent.DefaultGroups(clusterName, addonName) - config := struct { - ClusterName string - Group string - }{ - ClusterName: clusterName, - Group: groups[0], - } - - results := resourceapply.ApplyDirectly(context.Background(), - resourceapply.NewKubeClientHolder(kubeclient), - recorder, - resourceapply.NewResourceCache(), - func(name string) ([]byte, error) { - template, err := fs.ReadFile(file) - if err != nil { - return nil, err - } - return assets.MustCreateAssetFromTemplate(name, template, config).Data, nil - }, - file, - ) - - for _, result := range results { - if result.Error != nil { - return result.Error - } - } - - return nil -} - -func getValues(cluster *clusterv1.ManagedCluster, - addon *addonapiv1alpha1.ManagedClusterAddOn) (addonfactory.Values, error) { - installNamespace := addon.Spec.InstallNamespace - if len(installNamespace) == 0 { - installNamespace = addonagent.AgentInstallationNamespace - } - - image := os.Getenv("IMAGE_NAME") - if len(image) == 0 { - image = 
defaultExampleImage - } - - manifestConfig := struct { - KubeConfigSecret string - ClusterName string - AddonInstallNamespace string - Image string - }{ - KubeConfigSecret: fmt.Sprintf("%s-hub-kubeconfig", addon.Name), - AddonInstallNamespace: installNamespace, - ClusterName: cluster.Name, - Image: image, - } - - return addonfactory.StructToValues(manifestConfig), nil -} diff --git a/resource-usage-collect-addon/pkg/addon/main.go b/resource-usage-collect-addon/pkg/addon/main.go deleted file mode 100644 index f51cf551..00000000 --- a/resource-usage-collect-addon/pkg/addon/main.go +++ /dev/null @@ -1,110 +0,0 @@ -package main - -import ( - "context" - "fmt" - "math/rand" - "os" - "time" - - "k8s.io/klog/v2" - - goflag "flag" - - "github.com/openshift/library-go/pkg/controller/controllercmd" - "github.com/spf13/cobra" - "github.com/spf13/pflag" - utilflag "k8s.io/component-base/cli/flag" - "k8s.io/component-base/logs" - "open-cluster-management.io/addon-contrib/resource-usage-collect-addon/pkg/addon/agent" - "open-cluster-management.io/addon-framework/pkg/addonfactory" - addonagent "open-cluster-management.io/addon-framework/pkg/agent" - "open-cluster-management.io/addon-framework/pkg/version" - - utilrand "k8s.io/apimachinery/pkg/util/rand" - "open-cluster-management.io/addon-framework/pkg/addonmanager" -) - -func main() { - rand.Seed(time.Now().UTC().UnixNano()) - - pflag.CommandLine.SetNormalizeFunc(utilflag.WordSepNormalizeFunc) - pflag.CommandLine.AddGoFlagSet(goflag.CommandLine) - - logs.InitLogs() - defer logs.FlushLogs() - - command := newCommand() - if err := command.Execute(); err != nil { - fmt.Fprintf(os.Stderr, "%v\n", err) - os.Exit(1) - } -} - -func newCommand() *cobra.Command { - cmd := &cobra.Command{ - Use: "addon", - Short: "resource usage collection addon", - Run: func(cmd *cobra.Command, args []string) { - if err := cmd.Help(); err != nil { - fmt.Fprintf(os.Stderr, "%v\n", err) - } - os.Exit(1) - }, - } - - if v := version.Get().String(); len(v) == 0 { - cmd.Version = "" - } else { - cmd.Version = v - } - - cmd.AddCommand(newControllerCommand()) - cmd.AddCommand(agent.NewAgentCommand(addonName)) - - return cmd -} - -func newControllerCommand() *cobra.Command { - cmd := controllercmd. - NewControllerCommandConfig("resource-usage-collection-addon-controller", version.Get(), runController). - NewCommand() - cmd.Use = "controller" - cmd.Short = "Start the addon controller" - - return cmd -} - -func runController(ctx context.Context, controllerContext *controllercmd.ControllerContext) error { - mgr, err := addonmanager.New(controllerContext.KubeConfig) - if err != nil { - return err - } - registrationOption := newRegistrationOption( - controllerContext.KubeConfig, - controllerContext.EventRecorder, - utilrand.String(5)) - - agentAddon, err := addonfactory.NewAgentAddonFactory(addonName, fs, "manifests/templates"). - WithGetValuesFuncs(getValues, addonfactory.GetValuesFromAddonAnnotation). - WithAgentRegistrationOption(registrationOption). - WithInstallStrategy(addonagent.InstallAllStrategy(agent.AgentInstallationNamespace)). 
- BuildTemplateAgentAddon() - if err != nil { - klog.Errorf("failed to build agent %v", err) - return err - } - - err = mgr.AddAgent(agentAddon) - if err != nil { - klog.Fatal(err) - } - - err = mgr.Start(ctx) - if err != nil { - klog.Fatal(err) - } - <-ctx.Done() - - return nil -} diff --git a/resource-usage-collect-addon/pkg/addon/manifests/permission/role.yaml b/resource-usage-collect-addon/pkg/addon/manifests/permission/role.yaml deleted file mode 100644 index 0ab03c6b..00000000 --- a/resource-usage-collect-addon/pkg/addon/manifests/permission/role.yaml +++ /dev/null @@ -1,16 +0,0 @@ -kind: Role -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: open-cluster-management:resource-usage-collect:agent - namespace: {{ .ClusterName }} -rules: - - apiGroups: [""] - resources: ["nodes","configmaps", "pods"] - verbs: ["get", "list", "watch"] - - apiGroups: ["cluster.open-cluster-management.io"] - resources: ["addonplacementscores"] - verbs: ["get", "list", "watch", "create", "update", "delete", "deletecollection", "patch"] - - apiGroups: [ "cluster.open-cluster-management.io" ] - resources: [ "addonplacementscores/status" ] - verbs: [ "update", "patch" ] - diff --git a/resource-usage-collect-addon/pkg/addon/manifests/permission/rolebinding.yaml b/resource-usage-collect-addon/pkg/addon/manifests/permission/rolebinding.yaml deleted file mode 100644 index 5416dee7..00000000 --- a/resource-usage-collect-addon/pkg/addon/manifests/permission/rolebinding.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: open-cluster-management:resource-usage-collect:agent - namespace: {{ .ClusterName }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: open-cluster-management:resource-usage-collect:agent -subjects: - - kind: Group - apiGroup: rbac.authorization.k8s.io - name: {{ .Group }} diff --git a/resource-usage-collect-addon/pkg/addon/manifests/templates/clusterrole.yaml b/resource-usage-collect-addon/pkg/addon/manifests/templates/clusterrole.yaml deleted file mode 100644 index bf219312..00000000 --- a/resource-usage-collect-addon/pkg/addon/manifests/templates/clusterrole.yaml +++ /dev/null @@ -1,11 +0,0 @@ - kind: ClusterRole - apiVersion: rbac.authorization.k8s.io/v1 - metadata: - name: resource-usage-collect-agent - rules: - - apiGroups: [""] - resources: ["nodes","configmaps", "pods", "events"] - verbs: ["get", "list", "watch", "create", "update", "delete", "deletecollection", "patch"] - - apiGroups: ["coordination.k8s.io"] - resources: ["leases"] - verbs: ["create", "get", "list", "update", "watch", "patch"] diff --git a/resource-usage-collect-addon/pkg/addon/manifests/templates/clusterrolebinding.yaml b/resource-usage-collect-addon/pkg/addon/manifests/templates/clusterrolebinding.yaml deleted file mode 100644 index 7b897af1..00000000 --- a/resource-usage-collect-addon/pkg/addon/manifests/templates/clusterrolebinding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: resource-usage-collect-agent -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: resource-usage-collect-agent -subjects: - - kind: ServiceAccount - name: resource-usage-collect-agent-sa - namespace: {{ .AddonInstallNamespace }} diff --git a/resource-usage-collect-addon/pkg/addon/manifests/templates/deployment.yaml b/resource-usage-collect-addon/pkg/addon/manifests/templates/deployment.yaml deleted file mode 100644 index b8d33996..00000000 --- 
a/resource-usage-collect-addon/pkg/addon/manifests/templates/deployment.yaml +++ /dev/null @@ -1,35 +0,0 @@ -kind: Deployment -apiVersion: apps/v1 -metadata: - name: resource-usage-collect-agent - namespace: {{ .AddonInstallNamespace }} - labels: - app: resource-usage-collect-agent -spec: - replicas: 1 - selector: - matchLabels: - app: resource-usage-collect-agent - template: - metadata: - labels: - app: resource-usage-collect-agent - spec: - serviceAccountName: resource-usage-collect-agent-sa - volumes: - - name: hub-config - secret: - secretName: {{ .KubeConfigSecret }} - containers: - - name: resource-usage-collect-agent - image: {{ .Image }} - imagePullPolicy: Always - args: - - "/addon" - - "agent" - - "--hub-kubeconfig=/var/run/hub/kubeconfig" - - "--cluster-name={{ .ClusterName }}" - - "--addon-namespace={{ .AddonInstallNamespace }}" - volumeMounts: - - name: hub-config - mountPath: /var/run/hub diff --git a/resource-usage-collect-addon/pkg/addon/manifests/templates/serviceaccount.yaml b/resource-usage-collect-addon/pkg/addon/manifests/templates/serviceaccount.yaml deleted file mode 100644 index 12c1ae11..00000000 --- a/resource-usage-collect-addon/pkg/addon/manifests/templates/serviceaccount.yaml +++ /dev/null @@ -1,5 +0,0 @@ -kind: ServiceAccount -apiVersion: v1 -metadata: - name: resource-usage-collect-agent-sa - namespace: {{ .AddonInstallNamespace }} diff --git a/resource-usage-collect-addon/pkg/addon/doc.go b/resource-usage-collect-addon/pkg/doc.go similarity index 100% rename from resource-usage-collect-addon/pkg/addon/doc.go rename to resource-usage-collect-addon/pkg/doc.go diff --git a/resource-usage-collect-addon/pkg/main.go b/resource-usage-collect-addon/pkg/main.go new file mode 100644 index 00000000..7675edcc --- /dev/null +++ b/resource-usage-collect-addon/pkg/main.go @@ -0,0 +1,52 @@ +package main + +import ( + goflag "flag" + "fmt" + "github.com/spf13/cobra" + "github.com/spf13/pflag" + utilflag "k8s.io/component-base/cli/flag" + "k8s.io/component-base/logs" + "math/rand" + "open-cluster-management.io/addon-contrib/resource-usage-collect-addon/pkg/addon/agent" + "open-cluster-management.io/addon-framework/pkg/version" + "os" + "time" +) + +func main() { + rand.Seed(time.Now().UTC().UnixNano()) + + pflag.CommandLine.SetNormalizeFunc(utilflag.WordSepNormalizeFunc) + pflag.CommandLine.AddGoFlagSet(goflag.CommandLine) + + logs.InitLogs() + defer logs.FlushLogs() + + command := newCommand() + if err := command.Execute(); err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + os.Exit(1) + } +} + +func newCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "addon", + Short: "resource usage collection addon", + Run: func(cmd *cobra.Command, args []string) { + if err := cmd.Help(); err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + } + os.Exit(1) + }, + } + + if v := version.Get().String(); len(v) == 0 { + cmd.Version = "" + } else { + cmd.Version = v + } + cmd.AddCommand(agent.NewAgentCommand("resource-usage-collect-addon")) + return cmd +}
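For reference, a `Placement` that consumes the scores published by this addon might look roughly like the sketch below. The placement name and namespace are illustrative assumptions; `resource-usage-score` and `gpuClusterAvailable` are the `AddOnPlacementScore` resource and score names described in the README changes above. It selects the single cluster with the highest cluster-wide GPU availability.

```yaml
apiVersion: cluster.open-cluster-management.io/v1beta1
kind: Placement
metadata:
  name: placement-gpu        # illustrative name
  namespace: default         # namespace where the default ManagedClusterSet is bound
spec:
  numberOfClusters: 1
  clusterSets:
    - default
  prioritizerPolicy:
    mode: Exact
    configurations:
      - scoreCoordinate:
          type: AddOn
          addOn:
            resourceName: resource-usage-score   # AddOnPlacementScore created by the agent
            scoreName: gpuClusterAvailable       # cluster-scope GPU score
        weight: 1
```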