
Commit

KUBESAW-187: Adjust ksctl adm restart command to use rollout-restart (#79)

* KUBESAW-187: Adjust ksctl adm restart command to use rollout-restart

Signed-off-by: Feny Mehta <[email protected]>

* some checking

Signed-off-by: Feny Mehta <[email protected]>

* golint

Signed-off-by: Feny Mehta <[email protected]>

* few changes to the logic

Signed-off-by: Feny Mehta <[email protected]>

* t cases

Signed-off-by: Feny Mehta <[email protected]>

* Review comments

Signed-off-by: Feny Mehta <[email protected]>

* Review comments

Signed-off-by: Feny Mehta <[email protected]>

* check the args

Signed-off-by: Feny Mehta <[email protected]>

* adding unit test cases

Signed-off-by: Feny Mehta <[email protected]>

* Change in test cases

Signed-off-by: Feny Mehta <[email protected]>

* minor change in unit test

Signed-off-by: Feny Mehta <[email protected]>

* unregister-member test

Signed-off-by: Feny Mehta <[email protected]>

* unit test case for restart

Signed-off-by: Feny Mehta <[email protected]>

* test case for delete

Signed-off-by: Feny Mehta <[email protected]>

* Rc1

Signed-off-by: Feny Mehta <[email protected]>

* golint

Signed-off-by: Feny Mehta <[email protected]>

* changes to the logic of restart

Signed-off-by: Feny Mehta <[email protected]>

* review comments-2

Signed-off-by: Feny Mehta <[email protected]>

* restart-test changes

Signed-off-by: Feny Mehta <[email protected]>

* CI

Signed-off-by: Feny Mehta <[email protected]>

* golang ci

Signed-off-by: Feny Mehta <[email protected]>

* adding tc

Signed-off-by: Feny Mehta <[email protected]>

* some addition to test cases

Signed-off-by: Feny Mehta <[email protected]>

* some changes

Signed-off-by: Feny Mehta <[email protected]>

* adding some comments

Signed-off-by: Feny Mehta <[email protected]>

* autoscaling buffer test case

Signed-off-by: Feny Mehta <[email protected]>

* Modification of test cases

Signed-off-by: Feny Mehta <[email protected]>

* Go lint

Signed-off-by: Feny Mehta <[email protected]>

* Test case of status

Signed-off-by: Feny Mehta <[email protected]>

* Linter

Signed-off-by: Feny Mehta <[email protected]>

* test of unregister_member

Signed-off-by: Feny Mehta <[email protected]>

* phase-3 rc

Signed-off-by: Feny Mehta <[email protected]>

* code cov

Signed-off-by: Feny Mehta <[email protected]>

* some changes to status func

Signed-off-by: Feny Mehta <[email protected]>

* leftovers

Signed-off-by: Feny Mehta <[email protected]>

* merge conflict

Signed-off-by: Feny Mehta <[email protected]>

* some changes as per rc

Signed-off-by: Feny Mehta <[email protected]>

* go version fix

Signed-off-by: Feny Mehta <[email protected]>

* extra left overs

Signed-off-by: Feny Mehta <[email protected]>

* linter

Signed-off-by: Feny Mehta <[email protected]>

---------

Signed-off-by: Feny Mehta <[email protected]>
Co-authored-by: Matous Jobanek <[email protected]>
Co-authored-by: Francisc Munteanu <[email protected]>
3 people authored Nov 27, 2024
1 parent 5a14398 commit a89cb52
Showing 9 changed files with 553 additions and 306 deletions.
1 change: 1 addition & 0 deletions go.mod
@@ -81,6 +81,7 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
github.com/lithammer/dedent v1.1.0 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-isatty v0.0.18 // indirect
2 changes: 2 additions & 0 deletions go.sum
@@ -436,6 +436,8 @@ github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+
github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII=
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0=
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE=
github.com/lithammer/dedent v1.1.0 h1:VNzHMVCBNG1j0fh3OrsFRkVUwStdDArbgBWoPAffktY=
github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc=
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
10 changes: 0 additions & 10 deletions pkg/cmd/adm/register_member_test.go
@@ -17,7 +17,6 @@ import (
"github.com/kubesaw/ksctl/pkg/utils"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
@@ -515,15 +514,6 @@ func verifyToolchainClusterSecret(t *testing.T, fakeClient *test.FakeClient, saN
require.Equal(t, fmt.Sprintf("token-secret-for-%s", saName), apiConfig.AuthInfos["auth"].Token)
}

func whenDeploymentThenUpdated(t *testing.T, fakeClient *test.FakeClient, namespacedName types.NamespacedName, currentReplicas int32, numberOfUpdateCalls *int) func(ctx context.Context, obj runtimeclient.Object, opts ...runtimeclient.UpdateOption) error {
return func(ctx context.Context, obj runtimeclient.Object, opts ...runtimeclient.UpdateOption) error {
if deployment, ok := obj.(*appsv1.Deployment); ok {
checkDeploymentBeingUpdated(t, fakeClient, namespacedName, currentReplicas, numberOfUpdateCalls, deployment)
}
return fakeClient.Client.Update(ctx, obj, opts...)
}
}

func newFakeClientsFromRestConfig(t *testing.T, initObjs ...runtimeclient.Object) (newClientFromRestConfigFunc, *test.FakeClient) {
fakeClient := test.NewFakeClient(t, initObjs...)
fakeClient.MockCreate = func(ctx context.Context, obj runtimeclient.Object, opts ...runtimeclient.CreateOption) error {
258 changes: 164 additions & 94 deletions pkg/cmd/adm/restart.go
@@ -1,157 +1,227 @@
package adm

import (
"context"
"fmt"
"os"
"time"

"github.com/kubesaw/ksctl/pkg/client"
"github.com/kubesaw/ksctl/pkg/cmd/flags"
"github.com/kubesaw/ksctl/pkg/configuration"
clicontext "github.com/kubesaw/ksctl/pkg/context"
"github.com/kubesaw/ksctl/pkg/ioutils"

"github.com/spf13/cobra"
appsv1 "k8s.io/api/apps/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/cli-runtime/pkg/genericclioptions"
"k8s.io/cli-runtime/pkg/genericiooptions"
kubectlrollout "k8s.io/kubectl/pkg/cmd/rollout"
cmdutil "k8s.io/kubectl/pkg/cmd/util"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
)

type (
RolloutRestartFunc func(ctx *clicontext.CommandContext, deployment appsv1.Deployment) error
RolloutStatusCheckerFunc func(ctx *clicontext.CommandContext, deployment appsv1.Deployment) error
)

// NewRestartCmd() is a function to restart the whole operator; it relies on the target cluster and fetches the cluster config
// 1. If the command is run for the host operator, it restarts the whole host operator (it deletes olm based pods (host-operator pods),
// waits for the new pods to come up, then uses the rollout-restart command for the non-olm based deployment - registration-service)
// 2. If the command is run for the member operator, it restarts the whole member operator (it deletes olm based pods (member-operator pods),
// waits for the new pods to come up, then uses the rollout-restart command for the non-olm based deployments - webhooks)
func NewRestartCmd() *cobra.Command {
var targetCluster string
command := &cobra.Command{
Use: "restart -t <cluster-name> <deployment-name>",
Short: "Restarts a deployment",
Long: `Restarts the deployment with the given name in the operator namespace.
If no deployment name is provided, then it lists all existing deployments in the namespace.`,
Args: cobra.RangeArgs(0, 1),
Use: "restart <cluster-name>",
Short: "Restarts an operator",
Long: `Restarts the whole operator, it relies on the target cluster and fetches the cluster config
1. If the command is run for host operator, it restarts the whole host operator.
(it deletes olm based pods (host-operator pods), waits for the new pods to
come up, then uses rollout-restart command for non-olm based deployments - registration-service)
2. If the command is run for member operator, it restarts the whole member operator.
(it deletes olm based pods (member-operator pods), waits for the new pods
to come up, then uses rollout-restart command for non-olm based deployments - webhooks)`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
term := ioutils.NewTerminal(cmd.InOrStdin, cmd.OutOrStdout)
ctx := clicontext.NewCommandContext(term, client.DefaultNewClient)
return restart(ctx, targetCluster, args...)
return restart(ctx, args[0])
},
}
command.Flags().StringVarP(&targetCluster, "target-cluster", "t", "", "The target cluster")
flags.MustMarkRequired(command, "target-cluster")
return command
}
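// Illustrative only (not part of this file): the command can be driven through cobra's
// public API, with the cluster name as the single positional argument; "host" below is a
// placeholder for a cluster name defined in the ksctl config.
//
//	cmd := NewRestartCmd()
//	cmd.SetArgs([]string{"host"})
//	err := cmd.Execute() // cobra.ExactArgs(1) rejects zero or more than one argument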

func restart(ctx *clicontext.CommandContext, clusterName string, deployments ...string) error {
func restart(ctx *clicontext.CommandContext, clusterName string) error {
kubeConfigFlags := genericclioptions.NewConfigFlags(true).WithDeprecatedPasswordFlag()
ioStreams := genericiooptions.IOStreams{
In: os.Stdin,
Out: os.Stdout,
ErrOut: os.Stderr,
}
kubeConfigFlags.ClusterName = nil // `cluster` flag is redefined for our own purpose
kubeConfigFlags.AuthInfoName = nil // unused here, so we can hide it
kubeConfigFlags.Context = nil // unused here, so we can hide it

cfg, err := configuration.LoadClusterConfig(ctx, clusterName)
if err != nil {
return err
}
cl, err := ctx.NewClient(cfg.Token, cfg.ServerAPI)
kubeConfigFlags.Namespace = &cfg.OperatorNamespace
kubeConfigFlags.APIServer = &cfg.ServerAPI
kubeConfigFlags.BearerToken = &cfg.Token
kubeconfig, err := client.EnsureKsctlConfigFile()
if err != nil {
return err
}

if len(deployments) == 0 {
err := printExistingDeployments(ctx.Terminal, cl, cfg.OperatorNamespace)
if err != nil {
ctx.Terminal.Printlnf("\nERROR: Failed to list existing deployments\n :%s", err.Error())
}
return fmt.Errorf("at least one deployment name is required, include one or more of the above deployments to restart")
}
deploymentName := deployments[0]
kubeConfigFlags.KubeConfig = &kubeconfig
factory := cmdutil.NewFactory(cmdutil.NewMatchVersionFlags(kubeConfigFlags))

if !ctx.AskForConfirmation(
ioutils.WithMessagef("restart the deployment '%s' in namespace '%s'", deploymentName, cfg.OperatorNamespace)) {
ioutils.WithMessagef("restart all the deployments in the cluster '%s' and namespace '%s' \n", clusterName, cfg.OperatorNamespace)) {
return nil
}
return restartDeployment(ctx, cl, cfg.OperatorNamespace, deploymentName)
}

func restartDeployment(ctx *clicontext.CommandContext, cl runtimeclient.Client, ns string, deploymentName string) error {
namespacedName := types.NamespacedName{
Namespace: ns,
Name: deploymentName,
cl, err := ctx.NewClient(cfg.Token, cfg.ServerAPI)
if err != nil {
return err
}

originalReplicas, err := scaleToZero(cl, namespacedName)
return restartDeployments(ctx, cl, cfg.OperatorNamespace, func(ctx *clicontext.CommandContext, deployment appsv1.Deployment) error {
return checkRolloutStatus(ctx, factory, ioStreams, deployment)
}, func(ctx *clicontext.CommandContext, deployment appsv1.Deployment) error {
return restartNonOlmDeployments(ctx, deployment, factory, ioStreams)
})
}

// This function has the whole logic of getting the list of olm and non-olm based deployments, then proceeds with restarting/deleting them accordingly
func restartDeployments(ctx *clicontext.CommandContext, cl runtimeclient.Client, ns string, checker RolloutStatusCheckerFunc, restarter RolloutRestartFunc) error {

ctx.Printlnf("Fetching the current OLM and non-OLM deployments of the operator in %s namespace", ns)
olmDeploymentList, nonOlmDeploymentList, err := getExistingDeployments(ctx, cl, ns)
if err != nil {
if apierrors.IsNotFound(err) {
ctx.Printlnf("\nERROR: The given deployment '%s' wasn't found.", deploymentName)
return printExistingDeployments(ctx, cl, ns)
}
return err
}
ctx.Println("The deployment was scaled to 0")
if err := scaleBack(ctx, cl, namespacedName, originalReplicas); err != nil {
ctx.Printlnf("Scaling the deployment '%s' in namespace '%s' back to '%d' replicas wasn't successful", originalReplicas)
ctx.Println("Please, try to contact administrators to scale the deployment back manually")
return err
//if there is no olm operator deployment, no need for restart
if len(olmDeploymentList.Items) == 0 {
return fmt.Errorf("no operator deployment found in namespace %s , it is required for the operator deployment to be running so the command can proceed with restarting the KubeSaw components", ns)
}
//Deleting the pods of the olm based operator deployment and then checking the status
for _, olmOperatorDeployment := range olmDeploymentList.Items {
ctx.Printlnf("Proceeding to delete the Pods of %v", olmOperatorDeployment.Name)

if err := deleteDeploymentPods(ctx, cl, olmOperatorDeployment); err != nil {
return err
}
//sleeping here so that when the status is called we get the correct status
time.Sleep(1 * time.Second)

ctx.Printlnf("Checking the status of the deleted pod's deployment %v", olmOperatorDeployment.Name)
//check the rollout status
if err := checker(ctx, olmOperatorDeployment); err != nil {
return err
}
}

//Non-OLM deployments, like reg-svc, are to be restarted
//if no non-OLM deployment is found it should just return with a message
if len(nonOlmDeploymentList.Items) == 0 {
// if there are no non-olm deployments
ctx.Printlnf("No Non-OLM deployment found in namespace %s, hence no restart happened", ns)
return nil
}
// if there is a Non-olm deployment found use rollout-restart command
for _, nonOlmDeployment := range nonOlmDeploymentList.Items {
//it should only use rollout restart for the deployments which are NOT autoscaling-buffer
if nonOlmDeployment.Name != "autoscaling-buffer" {
ctx.Printlnf("Proceeding to restart the non-olm deployment %v", nonOlmDeployment.Name)
//using rollout-restart
if err := restarter(ctx, nonOlmDeployment); err != nil {
return err
}
//check the rollout status
ctx.Printlnf("Checking the status of the rolled out deployment %v", nonOlmDeployment.Name)
if err := checker(ctx, nonOlmDeployment); err != nil {
return err
}
//if the deployment is not the autoscaling-buffer, it should return from the function and not go on to print the autoscaling-buffer message
//We do not expect more than 1 non-olm deployment for each OLM deployment and hence returning here
return nil
}
//message if there is an autoscaling-buffer deployment; it shouldn't be restarted, so we exit successfully
ctx.Printlnf("Found only autoscaling-buffer deployment in namespace %s , which is not required to be restarted", ns)
}

ctx.Printlnf("The deployment was scaled back to '%d'", originalReplicas)
return nil
}

func restartHostOperator(ctx *clicontext.CommandContext, hostClient runtimeclient.Client, hostNamespace string) error {
deployments := &appsv1.DeploymentList{}
if err := hostClient.List(context.TODO(), deployments,
runtimeclient.InNamespace(hostNamespace),
runtimeclient.MatchingLabels{"olm.owner.namespace": "toolchain-host-operator"}); err != nil {
func deleteDeploymentPods(ctx *clicontext.CommandContext, cl runtimeclient.Client, deployment appsv1.Deployment) error {
//get pods by label selector from the deployment
pods := corev1.PodList{}
selector, _ := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
if err := cl.List(ctx, &pods,
runtimeclient.MatchingLabelsSelector{Selector: selector},
runtimeclient.InNamespace(deployment.Namespace)); err != nil {
return err
}
if len(deployments.Items) != 1 {
return fmt.Errorf("there should be a single deployment matching the label olm.owner.namespace=toolchain-host-operator in %s ns, but %d was found. "+
"It's not possible to restart the Host Operator deployment", hostNamespace, len(deployments.Items))

//delete pods
for _, pod := range pods.Items {
pod := pod // TODO We won't need it after upgrading to go 1.22: https://go.dev/blog/loopvar-preview
ctx.Printlnf("Deleting pod: %s", pod.Name)
if err := cl.Delete(ctx, &pod); err != nil {
return err
}
}

return restartDeployment(ctx, hostClient, hostNamespace, deployments.Items[0].Name)
return nil

}
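// In kubectl terms the loop above is roughly `kubectl delete pods -l <deployment-selector> -n <namespace>`;
// the deployment's ReplicaSet then recreates the deleted pods, which is what forces the operator restart.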

func printExistingDeployments(term ioutils.Terminal, cl runtimeclient.Client, ns string) error {
deployments := &appsv1.DeploymentList{}
if err := cl.List(context.TODO(), deployments, runtimeclient.InNamespace(ns)); err != nil {
func restartNonOlmDeployments(ctx *clicontext.CommandContext, deployment appsv1.Deployment, f cmdutil.Factory, ioStreams genericclioptions.IOStreams) error {

o := kubectlrollout.NewRolloutRestartOptions(ioStreams)

if err := o.Complete(f, nil, []string{"deployment/" + deployment.Name}); err != nil {
return err
}
deploymentList := "\n"
for _, deployment := range deployments.Items {
deploymentList += fmt.Sprintf("%s\n", deployment.Name)

if err := o.Validate(); err != nil {
return err
}
term.PrintContextSeparatorWithBodyf(deploymentList, "Existing deployments in %s namespace", ns)
return nil
ctx.Printlnf("Running the rollout restart command for non-Olm deployment %v", deployment.Name)
return o.RunRestart()
}
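// The options above drive the same code path as `kubectl rollout restart deployment/<name>`
// in the namespace configured on the factory, so the pods are replaced by a fresh rollout
// rather than being deleted directly.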

func scaleToZero(cl runtimeclient.Client, namespacedName types.NamespacedName) (int32, error) {
// get the deployment
deployment := &appsv1.Deployment{}
if err := cl.Get(context.TODO(), namespacedName, deployment); err != nil {
return 0, err
func checkRolloutStatus(ctx *clicontext.CommandContext, f cmdutil.Factory, ioStreams genericclioptions.IOStreams, deployment appsv1.Deployment) error {

cmd := kubectlrollout.NewRolloutStatusOptions(ioStreams)

if err := cmd.Complete(f, []string{"deployment/" + deployment.Name}); err != nil {
return err
}
// keep original number of replicas so we can bring it back
originalReplicas := *deployment.Spec.Replicas
zero := int32(0)
deployment.Spec.Replicas = &zero

// update the deployment so it scales to zero
return originalReplicas, cl.Update(context.TODO(), deployment)
if err := cmd.Validate(); err != nil {
return err
}
ctx.Printlnf("Running the Rollout status to check the status of the deployment")
return cmd.Run()
}
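// This mirrors `kubectl rollout status deployment/<name>`: Run() watches the deployment and
// returns only once the rollout has completed (or an error occurs), so a nil error means the
// new pods are up and ready.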

func scaleBack(term ioutils.Terminal, cl runtimeclient.Client, namespacedName types.NamespacedName, originalReplicas int32) error {
return wait.PollUntilContextTimeout(context.TODO(), 500*time.Millisecond, 10*time.Second, false, func(ctx context.Context) (done bool, err error) {
term.Println("")
term.Printlnf("Trying to scale the deployment back to '%d'", originalReplicas)
// get the updated
deployment := &appsv1.Deployment{}
if err := cl.Get(context.TODO(), namespacedName, deployment); err != nil {
return false, err
}
// check if the replicas number wasn't already reset by a controller
if *deployment.Spec.Replicas == originalReplicas {
return true, nil
}
// set the original
deployment.Spec.Replicas = &originalReplicas
// and update to scale back
if err := cl.Update(context.TODO(), deployment); err != nil {
term.Printlnf("error updating Deployment '%s': %s. Will retry again...", namespacedName.Name, err.Error())
return false, nil
}
return true, nil
})
func getExistingDeployments(ctx *clicontext.CommandContext, cl runtimeclient.Client, ns string) (*appsv1.DeploymentList, *appsv1.DeploymentList, error) {

olmDeployments := &appsv1.DeploymentList{}
if err := cl.List(ctx, olmDeployments,
runtimeclient.InNamespace(ns),
runtimeclient.MatchingLabels{"kubesaw-control-plane": "kubesaw-controller-manager"}); err != nil {
return nil, nil, err
}

nonOlmDeployments := &appsv1.DeploymentList{}
if err := cl.List(ctx, nonOlmDeployments,
runtimeclient.InNamespace(ns),
runtimeclient.MatchingLabels{"toolchain.dev.openshift.com/provider": "codeready-toolchain"}); err != nil {
return nil, nil, err
}

return olmDeployments, nonOlmDeployments, nil
}
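// A minimal sketch (assumed, not taken from this commit) of how the checker and restarter
// hooks can be stubbed so that restartDeployments can be exercised without a live cluster.
// `ctx` and `fakeClient` are assumed to be built as in the existing tests (a CommandContext
// around a test terminal, and a fake controller-runtime client pre-loaded with one OLM
// deployment labelled kubesaw-control-plane=kubesaw-controller-manager and one non-OLM
// deployment labelled toolchain.dev.openshift.com/provider=codeready-toolchain); the
// namespace below is a placeholder.
var restarted, checked []string

stubChecker := func(ctx *clicontext.CommandContext, d appsv1.Deployment) error {
	checked = append(checked, d.Name) // record the call instead of running `kubectl rollout status`
	return nil
}
stubRestarter := func(ctx *clicontext.CommandContext, d appsv1.Deployment) error {
	restarted = append(restarted, d.Name) // record the call instead of running `kubectl rollout restart`
	return nil
}

// deletes the OLM deployment's pods on fakeClient, checks its rollout status, then
// rollout-restarts the non-OLM deployment (skipping any autoscaling-buffer deployment)
err := restartDeployments(ctx, fakeClient, "my-operator-namespace", stubChecker, stubRestarter)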