From c48947f140890bab9eeab43026df9b156e638b31 Mon Sep 17 00:00:00 2001 From: Danil Grigorev Date: Sat, 7 Sep 2024 09:33:34 +0200 Subject: [PATCH] Implement Planner for system-agent integration (#727) Signed-off-by: Danil-Grigorev --- ...er-turtles-exp-etcdrestore-components.yaml | 3 + .../api/v1alpha1/etcdmachinesnapshot_types.go | 1 + ...s-capi.cattle.io_etcdmachinesnapshots.yaml | 3 + exp/etcdrestore/controllers/planner.go | 266 ++++++++++++++++++ 4 files changed, 273 insertions(+) create mode 100644 exp/etcdrestore/controllers/planner.go diff --git a/charts/rancher-turtles/templates/rancher-turtles-exp-etcdrestore-components.yaml b/charts/rancher-turtles/templates/rancher-turtles-exp-etcdrestore-components.yaml index 0e828c1b..14605b91 100644 --- a/charts/rancher-turtles/templates/rancher-turtles-exp-etcdrestore-components.yaml +++ b/charts/rancher-turtles/templates/rancher-turtles-exp-etcdrestore-components.yaml @@ -58,6 +58,8 @@ spec: type: string type: object x-kubernetes-map-type: atomic + location: + type: string machineName: type: string manual: @@ -65,6 +67,7 @@ spec: required: - clusterName - configRef + - location - machineName - manual type: object diff --git a/exp/etcdrestore/api/v1alpha1/etcdmachinesnapshot_types.go b/exp/etcdrestore/api/v1alpha1/etcdmachinesnapshot_types.go index e03c8eae..4eea1d7e 100644 --- a/exp/etcdrestore/api/v1alpha1/etcdmachinesnapshot_types.go +++ b/exp/etcdrestore/api/v1alpha1/etcdmachinesnapshot_types.go @@ -46,6 +46,7 @@ type EtcdMachineSnapshotSpec struct { MachineName string `json:"machineName"` ConfigRef corev1.LocalObjectReference `json:"configRef"` Manual bool `json:"manual"` + Location string `json:"location"` } // EtcdSnapshotRestoreStatus defines observed state of EtcdSnapshotRestore diff --git a/exp/etcdrestore/config/crd/bases/turtles-capi.cattle.io_etcdmachinesnapshots.yaml b/exp/etcdrestore/config/crd/bases/turtles-capi.cattle.io_etcdmachinesnapshots.yaml index e7de0d1b..afb14b16 100644 --- a/exp/etcdrestore/config/crd/bases/turtles-capi.cattle.io_etcdmachinesnapshots.yaml +++ b/exp/etcdrestore/config/crd/bases/turtles-capi.cattle.io_etcdmachinesnapshots.yaml @@ -55,6 +55,8 @@ spec: type: string type: object x-kubernetes-map-type: atomic + location: + type: string machineName: type: string manual: @@ -62,6 +64,7 @@ spec: required: - clusterName - configRef + - location - machineName - manual type: object diff --git a/exp/etcdrestore/controllers/planner.go b/exp/etcdrestore/controllers/planner.go new file mode 100644 index 00000000..7c4271de --- /dev/null +++ b/exp/etcdrestore/controllers/planner.go @@ -0,0 +1,266 @@ +/* +Copyright © 2023 - 2024 SUSE LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "bytes" + "compress/gzip" + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "strings" + + kerrors "k8s.io/apimachinery/pkg/util/errors" + + bootstrapv1 "github.com/rancher/cluster-api-provider-rke2/bootstrap/api/v1beta1" + snapshotrestorev1 "github.com/rancher/turtles/exp/etcdrestore/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// Planner is responsible for executing instructions on the underlying machine host +// in the specified order, and collecting output from executed steps. +type Planner struct { + client.Client + machine *clusterv1.Machine + secret *corev1.Secret +} + +// Instructions is a one time operation, used to perform shell commands on the host +type Instruction struct { + Name string `json:"name,omitempty"` + Image string `json:"image,omitempty"` + Env []string `json:"env,omitempty"` + Args []string `json:"args,omitempty"` + Command string `json:"command,omitempty"` + SaveOutput bool `json:"saveOutput,omitempty"` +} + +// Instructions is a list of instructions +type Instructions []Instruction + +type plan struct { + Instructions Instructions `json:"instructions,omitempty"` +} + +// Plan is initializing Planner, used to perform instructions in a specific order and collect results +func Plan(ctx context.Context, c client.Client, machine *clusterv1.Machine) *Planner { + return &Planner{ + Client: c, + } +} + +// Apply performs invocation of the supplied set of instructions, and reurns the ongoing state of the +// command execution +func (p *Planner) Apply(ctx context.Context, instructions ...Instruction) (Output, error) { + var err error + errs := []error{} + + data, err := json.Marshal(plan{Instructions: instructions}) + errs = append(errs, err) + + errs = append(errs, p.refresh(ctx)) + errs = append(errs, p.updatePlanSecret(ctx, data)) + errs = append(errs, p.refresh(ctx)) + + output := Output{ + Machine: p.machine, + Finished: p.applied(data, p.secret.Data["applied-checksum"]), + } + + if output.Finished { + output.Result, err = p.Output() + errs = append(errs, err) + } + + return output, kerrors.NewAggregate(errs) +} + +// Output holds results of the command execution +type Output struct { + Machine *clusterv1.Machine + Finished bool + Result map[string][]byte +} + +// Output retuns structured output from the command invocation +func (p *Planner) Output() (map[string][]byte, error) { + reader, err := gzip.NewReader(bytes.NewReader(p.secret.Data["applied-output"])) + if err != nil { + return nil, err + } + defer reader.Close() + + decompressedPlanOutput, err := io.ReadAll(reader) + if err != nil { + return nil, err + } + + outputMap := map[string][]byte{} + if err := json.Unmarshal(decompressedPlanOutput, &outputMap); err != nil { + return nil, fmt.Errorf("failed to unmarshal output: %w", err) + } + + return outputMap, nil +} + +// RKE2KillAll stops RKE2 server or agent on the node +func RKE2KillAll() Instruction { + return Instruction{ + Name: "shutdown", + Command: "/bin/sh", + Args: []string{ + "-c", + "if [ -z $(command -v rke2) ] && [ -z $(command -v rke2-killall.sh) ]; then echo rke2 does not appear to be installed; exit 0; else rke2-killall.sh; fi", + }, + SaveOutput: true, + } +} + +// ETCDRestore performs restore form a snapshot path on the init node +func ETCDRestore(snapshot *snapshotrestorev1.EtcdMachineSnapshot) Instruction { + return Instruction{ + Name: "etcd-restore", + Command: "/bin/sh", + Args: []string{ + "-c", + "rke2 server --cluster-reset", + fmt.Sprintf("--cluster-reset-restore-path=%s", strings.TrimPrefix(snapshot.Spec.Location, "file://")), + }, + SaveOutput: true, + } +} + +// ManifestRemoval cleans up old rke2 manifests on the machine +func ManifestRemoval() Instruction { + return Instruction{ + Name: "remove-server-manifests", + Command: "/bin/sh", + Args: []string{ + "-c", + "rm -rf /var/lib/rancher/rke2/server/manifests/rke2-*.yaml", + }, + SaveOutput: true, + } +} + +// RemoveServerURL deletes previous server url from config, allowing nodes to register using +// new init machine +func RemoveServerURL() Instruction { + return Instruction{ + Name: "remove-server-manifests", + Command: "/bin/sh", + Args: []string{ + "-c", + "sed -i '/^server:/d' /etc/rancher/rke2/config.yaml", + }, + SaveOutput: true, + } +} + +// SetServerURL sets the init machine URL, used to register RKE2 agents +func SetServerURL(serverIP string) Instruction { + return Instruction{ + Name: "replace-server-url", + Command: "/bin/sh", + Args: []string{ + "-c", + fmt.Sprintf("echo 'server: https://%s:9345' >> /etc/rancher/rke2/config.yaml", serverIP), + }, + SaveOutput: true, + } +} + +// RemoveETCDData removes etcd snapshot state form the machine +func RemoveETCDData() Instruction { + return Instruction{ + Name: "remove-etcd-db-dir", + Command: "/bin/sh", + Args: []string{ + "-c", + "rm -rf /var/lib/rancher/rke2/server/db/etcd", + }, + SaveOutput: true, + } +} + +// StartRKE2 start the RKE2 service +func StartRKE2() Instruction { + return Instruction{ + Name: "start-rke2", + Command: "/bin/sh", + Args: []string{ + "-c", + "systemctl start rke2-server.service", + }, + SaveOutput: true, + } +} + +func (p *Planner) applied(plan, appliedChecksum []byte) bool { + result := sha256.Sum256(plan) + planHash := hex.EncodeToString(result[:]) + + return planHash == string(appliedChecksum) +} + +func (p *Planner) updatePlanSecret(ctx context.Context, data []byte) error { + log := log.FromContext(ctx) + + if !bytes.Equal(p.secret.Data["plan"], data) { + log.Info("Plan secret not filled with proper plan", "machine", p.machine.Name) + } + + patchBase := client.MergeFromWithOptions(p.secret.DeepCopy(), client.MergeFromWithOptimisticLock{}) + + p.secret.Data["plan"] = []byte(data) + + if err := p.Client.Patch(ctx, p.secret, patchBase); err != nil { + return fmt.Errorf("failed to patch plan secret: %w", err) + } + + if !bytes.Equal(p.secret.Data["plan"], data) { + log.Info("Patched plan secret with plan", "machine", p.machine.Name) + } + + return nil +} + +func (p *Planner) refresh(ctx context.Context) error { + rke2Config := &bootstrapv1.RKE2Config{} + if err := p.Client.Get(ctx, client.ObjectKey{Namespace: p.machine.Namespace, Name: p.machine.Spec.Bootstrap.ConfigRef.Name}, rke2Config); err != nil { + return fmt.Errorf("failed to get RKE2Config: %w", err) + } + + planSecretName := strings.Join([]string{rke2Config.Name, "rke2config", "plan"}, "-") + + secret := &corev1.Secret{} + if err := p.Client.Get(ctx, client.ObjectKey{Namespace: p.machine.Namespace, Name: planSecretName}, secret); err != nil { + return fmt.Errorf("failed to get plan secret: %w", err) + } + + if secret.Data == nil { + secret.Data = map[string][]byte{} + } + + return nil +}