Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consumed capacity info in SpaceProvisionerConfig #1109

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
067308a
Make the SPC ready status reflect the ability to place spaces to
metlos Nov 28, 2024
8470d2b
fix small typos and improve the wording in comments
metlos Nov 29, 2024
f30cec6
Address linter complaints.
metlos Nov 29, 2024
a6b7053
Merge branch 'master' into member-info-in-spc-status-2
mfrancisc Dec 2, 2024
2130e70
return error when no ToolchainStatus is found
metlos Dec 3, 2024
466f212
Rename the tests to better reflect what they're doing
metlos Dec 3, 2024
8450dd9
Don't log the error that will be logged again by the callers
metlos Dec 3, 2024
a0b15c8
better naming and explanation why we're doing only a partial re-check…
metlos Dec 3, 2024
1b9a606
Merge remote-tracking branch 'origin/member-info-in-spc-status-2' int…
metlos Dec 3, 2024
691752e
Add tests for the GetSpaceCountsFromCountsCache.
metlos Dec 3, 2024
d6db400
Clean up comments
metlos Dec 5, 2024
3b1b557
add a testcase for multiple memory usage threshold breaches
metlos Dec 5, 2024
9e27aa8
Merge remote-tracking branch 'origin/member-info-in-spc-status-2' int…
metlos Dec 5, 2024
136c8ed
Fix typos
metlos Dec 5, 2024
7c935b5
remove GetUsageFunc from the SPC, always read the ToolchainStatus
metlos Dec 6, 2024
0bebee7
Make sure the mappers are used with the correct object type
metlos Dec 6, 2024
f97d687
reset the env var to its original value after the test
metlos Dec 6, 2024
087b3bd
Move the update of the space count from the predicate up a level for it
metlos Dec 6, 2024
5972be2
Move the GetSpaceCountFromSpaceProvisionerConfigs helper function into
metlos Dec 6, 2024
226ca14
Simplify the capacity ready state computation.
metlos Dec 6, 2024
845ea69
Merge remote-tracking branch 'origin/member-info-in-spc-status-2' int…
metlos Dec 6, 2024
db8505f
Mirror the consumed capacity recorded in the ToolchainStatus to
metlos Dec 6, 2024
499ef81
Merge branch 'master' into member-info-in-spc-status_controller-part
MatousJobanek Dec 12, 2024
68d6be5
Simplify the logic in refreshStatus of the SPC.
metlos Dec 12, 2024
9a0e4f6
Break up long assertions in the test
metlos Dec 12, 2024
b058ec5
Merge remote-tracking branch 'upstream/master' into member-info-in-sp…
metlos Dec 16, 2024
6ffd1e5
Merge branch 'master' into member-info-in-spc-status_controller-part
metlos Dec 18, 2024
90cf19a
Merge branch 'master' into member-info-in-spc-status_controller-part
fbm3307 Dec 18, 2024
a514b81
Merge branch 'master' into member-info-in-spc-status_controller-part
metlos Dec 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions controllers/spaceprovisionerconfig/mapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,40 @@

import (
"context"

toolchainv1alpha1 "github.com/codeready-toolchain/api/api/v1alpha1"
"k8s.io/apimachinery/pkg/types"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

func MapToolchainClusterToSpaceProvisionerConfigs(ctx context.Context, cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
return func(context context.Context, obj runtimeclient.Object) []reconcile.Request {
func MapToolchainClusterToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
fbm3307 marked this conversation as resolved.
Show resolved Hide resolved
return func(ctx context.Context, obj runtimeclient.Object) []reconcile.Request {
if _, ok := obj.(*toolchainv1alpha1.ToolchainCluster); !ok {
return nil
}

Check warning on line 17 in controllers/spaceprovisionerconfig/mapper.go

View check run for this annotation

Codecov / codecov/patch

controllers/spaceprovisionerconfig/mapper.go#L16-L17

Added lines #L16 - L17 were not covered by tests

ret, err := findReferencingProvisionerConfigs(ctx, cl, runtimeclient.ObjectKeyFromObject(obj))
if err != nil {
log.FromContext(ctx).Error(err, "failed to list SpaceProvisionerConfig objects while determining what objects to reconcile",
"toolchainClusterCause", runtimeclient.ObjectKeyFromObject(obj))
"causeObj", runtimeclient.ObjectKeyFromObject(obj), "causeKind", "ToolchainCluster")
return []reconcile.Request{}
}
return ret
}
}

func MapToolchainStatusToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
fbm3307 marked this conversation as resolved.
Show resolved Hide resolved
return func(ctx context.Context, obj runtimeclient.Object) []reconcile.Request {
if _, ok := obj.(*toolchainv1alpha1.ToolchainStatus); !ok {
return nil
}

Check warning on line 33 in controllers/spaceprovisionerconfig/mapper.go

View check run for this annotation

Codecov / codecov/patch

controllers/spaceprovisionerconfig/mapper.go#L32-L33

Added lines #L32 - L33 were not covered by tests

ret, err := findAllSpaceProvisionerConfigsInNamespace(ctx, cl, obj.GetNamespace())
if err != nil {
log.FromContext(ctx).Error(err, "failed to list SpaceProvisionerConfig objects while determining what objects to reconcile",
"causeObj", runtimeclient.ObjectKeyFromObject(obj), "causeKind", "ToolchainStatus")
return []reconcile.Request{}
}
return ret
Expand All @@ -39,3 +60,20 @@
}
return ret, nil
}

func findAllSpaceProvisionerConfigsInNamespace(ctx context.Context, cl runtimeclient.Client, ns string) ([]reconcile.Request, error) {
configs := &toolchainv1alpha1.SpaceProvisionerConfigList{}
if err := cl.List(ctx, configs, runtimeclient.InNamespace(ns)); err != nil {
return nil, err
}
ret := make([]reconcile.Request, 0, len(configs.Items))
for _, cfg := range configs.Items {
ret = append(ret, reconcile.Request{
NamespacedName: types.NamespacedName{
Namespace: cfg.Namespace,
Name: cfg.Name,
},
})
}
return ret, nil
}
51 changes: 49 additions & 2 deletions controllers/spaceprovisionerconfig/mapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func TestMapToolchainClusterToSpaceProvisionerConfigs(t *testing.T) {
cl := test.NewFakeClient(t, spc0, spc1, spc2)

// when
reqs := MapToolchainClusterToSpaceProvisionerConfigs(context.TODO(), cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
reqs := MapToolchainClusterToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "cluster1",
Namespace: test.HostOperatorNs,
Expand All @@ -123,7 +123,7 @@ func TestMapToolchainClusterToSpaceProvisionerConfigs(t *testing.T) {
}

// when
reqs := MapToolchainClusterToSpaceProvisionerConfigs(context.TODO(), cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
reqs := MapToolchainClusterToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "cluster1",
Namespace: test.HostOperatorNs,
Expand All @@ -134,3 +134,50 @@ func TestMapToolchainClusterToSpaceProvisionerConfigs(t *testing.T) {
require.Empty(t, reqs)
})
}

func TestMapToolchainStatusToSpaceProvisionerConfig(t *testing.T) {
t.Run("finds all SPCs in namespace", func(t *testing.T) {
// given
spc0 := NewSpaceProvisionerConfig("spc0", test.HostOperatorNs, ReferencingToolchainCluster("cluster1"))
spc1 := NewSpaceProvisionerConfig("spc1", test.HostOperatorNs, ReferencingToolchainCluster("cluster2"))
spc2 := NewSpaceProvisionerConfig("spc2", test.HostOperatorNs, ReferencingToolchainCluster("cluster1"))
cl := test.NewFakeClient(t, spc0, spc1, spc2)

// when
reqs := MapToolchainStatusToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainStatus{
ObjectMeta: metav1.ObjectMeta{
Name: "toolchain-status",
Namespace: test.HostOperatorNs,
},
})

// then
require.Equal(t, []reconcile.Request{
requestFromObject(spc0),
requestFromObject(spc1),
requestFromObject(spc2),
}, reqs)
})
t.Run("interprets erors as empty result", func(t *testing.T) {
// given
cl := test.NewFakeClient(t, NewSpaceProvisionerConfig("spc0", test.HostOperatorNs, ReferencingToolchainCluster("cluster1")))
expectedErr := errors.New("expected list error")
cl.MockList = func(ctx context.Context, list runtimeclient.ObjectList, opts ...runtimeclient.ListOption) error {
if _, ok := list.(*toolchainv1alpha1.SpaceProvisionerConfigList); ok {
return expectedErr
}
return cl.Client.List(ctx, list, opts...)
}

// when
reqs := MapToolchainStatusToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainStatus{
ObjectMeta: metav1.ObjectMeta{
Name: "toolchain-status",
Namespace: test.HostOperatorNs,
},
})

// then
require.Empty(t, reqs)
})
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
"fmt"

toolchainv1alpha1 "github.com/codeready-toolchain/api/api/v1alpha1"
"github.com/codeready-toolchain/host-operator/controllers/toolchainconfig"
"github.com/codeready-toolchain/toolchain-common/pkg/condition"
"github.com/redhat-cop/operator-utils/pkg/util"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -25,19 +27,28 @@

var _ reconcile.Reconciler = (*Reconciler)(nil)

func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {

Check warning on line 30 in controllers/spaceprovisionerconfig/spaceprovisionerconfig_controller.go

View check run for this annotation

Codecov / codecov/patch

controllers/spaceprovisionerconfig/spaceprovisionerconfig_controller.go#L30

Added line #L30 was not covered by tests
return ctrl.NewControllerManagedBy(mgr).
For(&toolchainv1alpha1.SpaceProvisionerConfig{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
Watches(
&toolchainv1alpha1.ToolchainCluster{},
handler.EnqueueRequestsFromMapFunc(MapToolchainClusterToSpaceProvisionerConfigs(ctx, r.Client)),
handler.EnqueueRequestsFromMapFunc(MapToolchainClusterToSpaceProvisionerConfigs(r.Client)),
).
// we use the same information as the ToolchainStatus specific for the SPCs. Because memory consumption is
// read directly out of the member clusters using remote connections, let's look for it only once
// in ToolchainStatus and just read it out "locally" here without needing to reach out to the member clusters
// again.
Watches(
&toolchainv1alpha1.ToolchainStatus{},
handler.EnqueueRequestsFromMapFunc(MapToolchainStatusToSpaceProvisionerConfigs(r.Client)),

Check warning on line 43 in controllers/spaceprovisionerconfig/spaceprovisionerconfig_controller.go

View check run for this annotation

Codecov / codecov/patch

controllers/spaceprovisionerconfig/spaceprovisionerconfig_controller.go#L35-L43

Added lines #L35 - L43 were not covered by tests
).
Complete(r)
}

//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=spaceprovisionerconfigs,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=spaceprovisionerconfigs/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=toolchainclusters,verbs=get;list;watch
//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=toolchainstatuses,verbs=get;list;watch

// Reconcile ensures that SpaceProvisionerConfig is valid and points to an existing ToolchainCluster.
func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.Result, error) {
Expand All @@ -58,62 +69,153 @@
return ctrl.Result{}, nil
}

readyCondition, reportedError := r.determineReadyState(ctx, spaceProvisionerConfig)
reportedErr := r.refreshStatus(ctx, spaceProvisionerConfig)

var updated bool
spaceProvisionerConfig.Status.Conditions, updated = condition.AddOrUpdateStatusConditions(spaceProvisionerConfig.Status.Conditions,
readyCondition)
if !updated {
return ctrl.Result{}, reportedError
if err := r.Client.Status().Update(ctx, spaceProvisionerConfig); err != nil {
return ctrl.Result{}, err
}

logger.Info("updating SpaceProvisionerConfig", "readyCondition", readyCondition)
if err := r.Client.Status().Update(ctx, spaceProvisionerConfig); err != nil {
if reportedError != nil {
logger.Info("failed to update the status (reported as failed reconciliation) with a previous unreported error during reconciliation", "unreportedError", reportedError)
return ctrl.Result{}, reportedErr
}

func (r *Reconciler) refreshStatus(ctx context.Context, spc *toolchainv1alpha1.SpaceProvisionerConfig) error {
// clear out the consumed capacity - this will advertise to the user that we either failed before it made sense
// to collect it (and therefore we don't know it) or it was not available (and therefore we again don't know it)
spc.Status.ConsumedCapacity = nil

if !spc.Spec.Enabled {
updateReadyCondition(spc, corev1.ConditionFalse, toolchainv1alpha1.SpaceProvisionerConfigDisabledReason, "")
return nil
}

clusterCondition, err := r.determineClusterReadyState(ctx, spc)
if err != nil {
updateReadyCondition(spc, clusterCondition, toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotFoundReason, err.Error())

// the reconciler reacts on ToolchainCluster changes so it will be triggered once a new TC appears
// we therefore don't need to return error from the reconciler in the case the TC is not found.
if errors.IsNotFound(err) {
return nil
}
reportedError = fmt.Errorf("failed to update the SpaceProvisionerConfig status: %w", err)
return err
}

if clusterCondition != corev1.ConditionTrue {
updateReadyCondition(spc, clusterCondition, toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotReadyReason, "")
return nil
}

return ctrl.Result{}, reportedError
spc.Status.ConsumedCapacity, err = collectConsumedCapacity(ctx, r.Client, spc.Spec.ToolchainCluster, spc.Namespace)
if err != nil {
updateReadyCondition(spc, corev1.ConditionUnknown, toolchainv1alpha1.SpaceProvisionerConfigFailedToDetermineCapacityReason, err.Error())
return err
}

Check warning on line 112 in controllers/spaceprovisionerconfig/spaceprovisionerconfig_controller.go

View check run for this annotation

Codecov / codecov/patch

controllers/spaceprovisionerconfig/spaceprovisionerconfig_controller.go#L110-L112

Added lines #L110 - L112 were not covered by tests

capacityCondition := r.determineCapacityReadyState(spc)

reason := toolchainv1alpha1.SpaceProvisionerConfigValidReason
if capacityCondition != corev1.ConditionTrue {
reason = toolchainv1alpha1.SpaceProvisionerConfigInsufficientCapacityReason
}

updateReadyCondition(spc, capacityCondition, reason, "")

return nil
}

func (r *Reconciler) determineReadyState(ctx context.Context, spc *toolchainv1alpha1.SpaceProvisionerConfig) (toolchainv1alpha1.Condition, error) {
toolchainCluster := &toolchainv1alpha1.ToolchainCluster{}
toolchainClusterKey := runtimeclient.ObjectKey{Name: spc.Spec.ToolchainCluster, Namespace: spc.Namespace}
var toolchainPresent corev1.ConditionStatus
toolchainPresenceReason := toolchainv1alpha1.SpaceProvisionerConfigValidReason
var reportedError error
toolchainPresenceMessage := ""
if err := r.Client.Get(ctx, toolchainClusterKey, toolchainCluster); err != nil {
if !errors.IsNotFound(err) {
// we need to requeue the reconciliation in this case because we cannot be sure whether the ToolchainCluster
// is really present in the cluster or not. If we did not do that and instead just reported the error in
// the status, we could eventually leave the SPC in an incorrect state once the error condition in the cluster,
// that prevents us from reading the ToolchainCluster, clears. I.e. we need the requeue to keep the promise
// of eventual consistency.

reportedError = fmt.Errorf("failed to get the referenced ToolchainCluster: %w", err)
toolchainPresenceMessage = reportedError.Error()
// Note that this function merely mirrors the usage information found in the ToolchainStatus. This means that it actually may work
// with slightly stale data because the counter.Counts cache might not have been synced yet. This is ok though because the capacity manager
// doesn't completely rely on the readiness status of the SPC and will re-evaluate the decision taking into the account the contents of
// the counter cache and therefore completely "fresh" data.
func collectConsumedCapacity(ctx context.Context, cl runtimeclient.Client, clusterName string, toolchainStatusNs string) (*toolchainv1alpha1.ConsumedCapacity, error) {
status := &toolchainv1alpha1.ToolchainStatus{}
if err := cl.Get(ctx, types.NamespacedName{Namespace: toolchainStatusNs, Name: toolchainconfig.ToolchainStatusName}, status); err != nil {
return nil, fmt.Errorf("unable to read ToolchainStatus resource: %w", err)
}

for _, m := range status.Status.Members {
if m.ClusterName == clusterName {
cc := toolchainv1alpha1.ConsumedCapacity{}
cc.MemoryUsagePercentPerNodeRole = m.MemberStatus.ResourceUsage.MemoryUsagePerNodeRole
cc.SpaceCount = m.SpaceCount

return &cc, nil
}
toolchainPresenceReason = toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotFoundReason
toolchainPresent = corev1.ConditionFalse
} else {
readyCond, found := condition.FindConditionByType(toolchainCluster.Status.Conditions, toolchainv1alpha1.ConditionReady)
if !found {
toolchainPresent = corev1.ConditionFalse
} else {
toolchainPresent = readyCond.Status
}

return nil, nil
}

func (r *Reconciler) determineClusterReadyState(ctx context.Context, spc *toolchainv1alpha1.SpaceProvisionerConfig) (corev1.ConditionStatus, error) {
toolchainCluster := &toolchainv1alpha1.ToolchainCluster{}
if err := r.Client.Get(ctx, runtimeclient.ObjectKey{Name: spc.Spec.ToolchainCluster, Namespace: spc.Namespace}, toolchainCluster); err != nil {
if errors.IsNotFound(err) {
return corev1.ConditionFalse, err
}
if toolchainPresent != corev1.ConditionTrue {
toolchainPresenceReason = toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotReadyReason
// IsNotFound is self-explanatory but let's add a little bit of context to the error in other cases
return corev1.ConditionFalse, fmt.Errorf("failed to get the referenced ToolchainCluster: %w", err)
}

readyCond, found := condition.FindConditionByType(toolchainCluster.Status.Conditions, toolchainv1alpha1.ConditionReady)
if !found {
return corev1.ConditionFalse, nil
}

Check warning on line 162 in controllers/spaceprovisionerconfig/spaceprovisionerconfig_controller.go

View check run for this annotation

Codecov / codecov/patch

controllers/spaceprovisionerconfig/spaceprovisionerconfig_controller.go#L161-L162

Added lines #L161 - L162 were not covered by tests

return readyCond.Status, nil
}

func (r *Reconciler) determineCapacityReadyState(spc *toolchainv1alpha1.SpaceProvisionerConfig) corev1.ConditionStatus {
if spc.Status.ConsumedCapacity == nil {
// we don't know anything about the resource consumption in the member
return corev1.ConditionUnknown
}

// the cluster capacity is ok if it has room for additional spaces and enough free memory

roomForAdditionalSpaces := determineSpaceCountReadyState(spc)
if !roomForAdditionalSpaces {
return corev1.ConditionFalse
}

return determineMemoryUtilizationReadyState(spc)
}

// determineSpaceCountReadyState checks that there is room for additional spaces in the cluster.
// It always knows this fact so returning a bool is ok, in contrast to determinMemoryUtilizationReadyState.
func determineSpaceCountReadyState(spc *toolchainv1alpha1.SpaceProvisionerConfig) bool {
max := spc.Spec.CapacityThresholds.MaxNumberOfSpaces
return max == 0 || max > uint(spc.Status.ConsumedCapacity.SpaceCount)
}

// determineMemoryUtilizationReadyState checks that the cluster has enough free memory. It may not be able to tell the fact
// if the SPC doesn't contain memory usage information in the status. It therefore can return true, false or
// unknown condition values.
func determineMemoryUtilizationReadyState(spc *toolchainv1alpha1.SpaceProvisionerConfig) corev1.ConditionStatus {
if spc.Spec.CapacityThresholds.MaxMemoryUtilizationPercent == 0 {
// 0 max memory utilization means no limit
return corev1.ConditionTrue
}

if len(spc.Status.ConsumedCapacity.MemoryUsagePercentPerNodeRole) == 0 {
// we don't know the memory utilization in the member
return corev1.ConditionUnknown
}

// the memory utilitzation is ok if it is below the threshold in all node types
for _, val := range spc.Status.ConsumedCapacity.MemoryUsagePercentPerNodeRole {
if uint(val) >= spc.Spec.CapacityThresholds.MaxMemoryUtilizationPercent {
return corev1.ConditionFalse
}
}
return corev1.ConditionTrue
}

return toolchainv1alpha1.Condition{
func updateReadyCondition(spc *toolchainv1alpha1.SpaceProvisionerConfig, status corev1.ConditionStatus, reason, message string) {
readyCondition := toolchainv1alpha1.Condition{
Type: toolchainv1alpha1.ConditionReady,
Status: toolchainPresent,
Message: toolchainPresenceMessage,
Reason: toolchainPresenceReason,
}, reportedError
Status: status,
Reason: reason,
Message: message,
}
spc.Status.Conditions, _ = condition.AddOrUpdateStatusConditions(spc.Status.Conditions, readyCondition)
}
Loading
Loading