From 7c781f04a8c07df63d803ce9e89dee9ba063b061 Mon Sep 17 00:00:00 2001
From: David <djoshy@redhat.com>
Date: Thu, 25 Apr 2024 15:43:10 -0400
Subject: [PATCH 1/2] docs: add NodeDisruptionPolicy

---
 docs/FAQ.md                                   |   2 +-
 ...chineConfiguration.md => MachineConfig.md} |   0
 docs/MachineConfigController.md               |   2 +-
 docs/NodeDisruptionPolicy.md                  | 155 ++++++++++++++++++
 docs/README.md                                |   2 +-
 5 files changed, 158 insertions(+), 3 deletions(-)
 rename docs/{MachineConfiguration.md => MachineConfig.md} (100%)
 create mode 100644 docs/NodeDisruptionPolicy.md

diff --git a/docs/FAQ.md b/docs/FAQ.md
index 49b22ee2bc..05d18f6656 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -73,7 +73,7 @@ of kubelet.
 
 ## Q: Can I use the MCO to re-partition or re-install?
 
-Not today.  The [MachineConfig](MachineConfiguration.md) doc discusses which sections
+Not today.  The [MachineConfig](MachineConfig.md) doc discusses which sections
 of the rendered Ignition can be changed, and that does not include e.g. the Ignition
 `storage` section.  For example, you cannot currently switch an existing worker
 node to be encrypted or use RAID after the fact - you must re-provision the system.
diff --git a/docs/MachineConfiguration.md b/docs/MachineConfig.md
similarity index 100%
rename from docs/MachineConfiguration.md
rename to docs/MachineConfig.md
diff --git a/docs/MachineConfigController.md b/docs/MachineConfigController.md
index fc57938e0b..258f7f2e31 100644
--- a/docs/MachineConfigController.md
+++ b/docs/MachineConfigController.md
@@ -116,7 +116,7 @@ Use kubernetes Deployment behavior for LabelSelector to find Pods.
 
 ### Generating desired MachineConfig
 
-Use the merging behavior defined in MachineConfig design document [here](./MachineConfiguration.md#how-to-create-generated-machineconfig) to create a single MachineConfig from all the MachineConfig objects that were selected above.
+Use the merging behavior defined in MachineConfig design document [here](./MachineConfig.md#how-to-create-generated-machineconfig) to create a single MachineConfig from all the MachineConfig objects that were selected above.
 
 #### Ordering the MachineConfigs
 
diff --git a/docs/NodeDisruptionPolicy.md b/docs/NodeDisruptionPolicy.md
new file mode 100644
index 0000000000..651a1451f5
--- /dev/null
+++ b/docs/NodeDisruptionPolicy.md
@@ -0,0 +1,155 @@
+# NodeDisruptionPolicy
+
+Node Disruption Policy allows the user to define to what actions to take for minor MachineConfig updates. The MCO is introducing this feature in TechPreview for 4.16. More details about the design can be found in the [enhancement](https://github.com/openshift/enhancements/pull/1525).
+
+## Goals
+
+* Have users be able to define no-action and service reloads to specific MachineConfig changes
+* Have users be able to easily see existing cluster non-disruptive update cases
+
+## Non-Goals
+
+* Have NodeDisruptionPolicy apply to non-MCO driven changes (e.g. SRIOV can still reboot nodes)
+* Remove existing non-disruptive update paths (the user will be able to override cluster defaults)
+* Design for image-based updates (live apply and bootc, will be considered in the future)
+* Have the MCO validate whether a change can be successfully applied with the given NodeDisruptionPolicy (i.e. it is up to the responsibility of the user to ensure the correctness of their defined actions)
+
+## How it works right now
+
+The current cluster policies are always listed in NodeDisruptionPolicyStatus in a `MachineConfiguration`(not to be confused with `MachineConfig`) CR, called "cluster". All configurations should be applied to the `spec.nodeDisruptionPolicy` field of this CR. The operator will merge the user defined policies and the cluster defaults and then display them in the `status.nodeDisruptionPolicyStatus` field.
+
+**Note: All configuration must be done to the CR named "cluster". Any `MachineConfiguration` object made with another name will be rejected via a ValidatingAdmissionPolicy. This will be the single point of control for all Node Disruption Policies.**
+
+For example, when there is no user defined policies, the status will just show the cluster defaults. See below:
+
+```console
+$ oc get MachineConfiguration/cluster -o yaml
+apiVersion: operator.openshift.io/v1
+kind: MachineConfiguration
+metadata:
+  creationTimestamp: "2024-04-16T15:02:37Z"
+  generation: 4
+  name: cluster
+  resourceVersion: "261205"
+  uid: 2c67b155-1898-452f-adbd-ed376afc0ea2
+spec:
+  logLevel: Normal
+  managementState: Managed
+  operatorLogLevel: Normal
+status:
+  nodeDisruptionPolicyStatus:
+    clusterPolicies:
+      files:
+      - actions:
+        - type: None
+        path: /etc/mco/internal-registry-pull-secret.json
+      - actions:
+        - type: None
+        path: /var/lib/kubelet/config.json
+      - actions:
+        - reload:
+            serviceName: crio.service
+          type: Reload
+        path: /etc/machine-config-daemon/no-reboot/containers-gpg.pub
+      - actions:
+        - reload:
+            serviceName: crio.service
+          type: Reload
+        path: /etc/containers/policy.json
+      - actions:
+        - type: Special
+        path: /etc/containers/registries.conf
+      sshkey:
+        actions:
+        - type: None
+  readyReplicas: 0
+```
+Say, for instance the user applied the following MachineConfiguration:
+```
+apiVersion: operator.openshift.io/v1
+kind: MachineConfiguration
+metadata:
+  name: cluster
+  namespace: openshift-machine-config-operator
+spec:
+  nodeDisruptionPolicy:
+    files:
+      - path: /etc/my-file
+        actions:
+          - type: None
+```
+Now, the Status will be updated to reflect the merged policy:
+```
+$ oc get MachineConfiguration/cluster -o yaml
+apiVersion: operator.openshift.io/v1
+kind: MachineConfiguration
+metadata:
+  creationTimestamp: "2024-04-16T15:02:37Z"
+  generation: 4
+  name: cluster
+  resourceVersion: "261205"
+  uid: 2c67b155-1898-452f-adbd-ed376afc0ea2
+spec:
+  nodeDisruptionPolicy:
+    files:
+      - path: /etc/my-file
+        actions:
+          - type: None
+  logLevel: Normal
+  managementState: Managed
+  operatorLogLevel: Normal
+status:
+  nodeDisruptionPolicyStatus:
+    clusterPolicies:
+      files:
+      - actions:
+        - type: None
+        path: /etc/my-file
+      - actions:
+        - type: None
+        path: /etc/mco/internal-registry-pull-secret.json
+      - actions:
+        - type: None
+        path: /var/lib/kubelet/config.json
+      - actions:
+        - reload:
+            serviceName: crio.service
+          type: Reload
+        path: /etc/machine-config-daemon/no-reboot/containers-gpg.pub
+      - actions:
+        - reload:
+            serviceName: crio.service
+          type: Reload
+        path: /etc/containers/policy.json
+      - actions:
+        - type: Special
+        path: /etc/containers/registries.conf
+      sshkey:
+        actions:
+        - type: None
+  readyReplicas: 0
+
+```
+
+
+For this initial implementation the policy supports MachineConfig changes to the following:
+- Files
+- Units
+- sshKeys
+
+The following actions are supported:
+- `None`: No action will be done for this MachineConfig change.
+- `Drain`: This will drain the node of its current workload.
+- `Reload`: Reloads a user specified service. This requires specifying the service to be reloaded in the `reload.serviceName` field.
+- `Restart`: Restarts a user specified service. This requires specifying the service to be restarted in the `restart.serviceName` field
+- `DaemonReload`: This executes a daemon-reload, which reloads the systemd manager configuration.
+- `Reboot`: This will reboot the node.
+- `Special`: This is an internal MCO only action and cannot be set by the user.
+
+## Some key points to note
+
+- The default action for an unspecified change is reboot.
+- If there is a conflict between a user defined policy and the cluster default, the user defined policy will override the cluster default.
+- If any of the changes result in a reboot action, all other policies will be ignored.
+- There is no dedup of the final actions list. It is possible an action may be repeated if multiple policies are in effect for MachineConfig change.
+- It is important to remember that the cluster node disruption policy (as defined by the status of `MachineConfiguration/cluster`) applies to the difference between currentConfig and desiredConfig. If a file/service is added, updated or removed by a new MachineConfig change, then the node disruption policy for that respective file/service will be in effect. If no policy is defined for this change, it will result in a Reboot action.
\ No newline at end of file
diff --git a/docs/README.md b/docs/README.md
index aa9fb5c450..c10f197045 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -68,7 +68,7 @@ for more information.
 # What to look at after creating a MachineConfig
 
 Once you create a MachineConfig fragment like the above, the controller will generate a new "rendered" version that will be used as a target.
-For more information, see [MachineConfiguration](/docs/MachineConfiguration.md).
+For more information, see [MachineConfig](/docs/MachineConfig.md).
 
 In particular, you should look at `oc describe machineconfigpool` and `oc describe clusteroperator/machine-config` as noted above.
 

From 212846ea9aa22178a16207875c4a889bfa7b84fe Mon Sep 17 00:00:00 2001
From: David <djoshy@redhat.com>
Date: Thu, 25 Apr 2024 15:45:14 -0400
Subject: [PATCH 2/2] daemon: minor nit fixes

---
 pkg/daemon/update.go | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go
index 403542ba01..cccd3dd3b4 100644
--- a/pkg/daemon/update.go
+++ b/pkg/daemon/update.go
@@ -154,9 +154,14 @@ func (dn *Daemon) executeReloadServiceNodeDisruptionAction(serviceName string, r
 // If at any point an error occurs, we reboot the node so that node has correct configuration.
 func (dn *Daemon) performPostConfigChangeNodeDisruptionAction(postConfigChangeActions []opv1.NodeDisruptionPolicyStatusAction, configName string) error {
 
-	logSystem("Executing performPostConfigChangeNodeDisruptionAction(drain already complete/skipped) for config %s", configName)
 	for _, action := range postConfigChangeActions {
-		logSystem("Executing postconfig action: %v for config %s", action.Type, configName)
+
+		// Drain is already completed at this stage and essentially a no-op for this loop, so no need to log that.
+		if action.Type == opv1.DrainStatusAction {
+			continue
+		}
+
+		logSystem("Performing post config change action: %v for config %s", action.Type, configName)
 		if action.Type == opv1.RebootStatusAction {
 			err := upgrademonitor.GenerateAndApplyMachineConfigNodes(
 				&upgrademonitor.Condition{State: mcfgalphav1.MachineConfigNodeUpdatePostActionComplete, Reason: string(mcfgalphav1.MachineConfigNodeUpdateRebooted), Message: fmt.Sprintf("Node will reboot into config %s", configName)},
@@ -608,7 +613,7 @@ func calculatePostConfigChangeNodeDisruptionActionFromMCDiffs(diffSSH bool, diff
 		for _, policyFile := range clusterPolicies.Files {
 			klog.V(4).Infof("comparing policy path %s to diff path %s", policyFile.Path, diffPath)
 			if policyFile.Path == diffPath {
-				klog.Infof("NodeDisruptionPolicy found for diff file %s!", diffPath)
+				klog.Infof("NodeDisruptionPolicy found for diff file %s", diffPath)
 				actions = append(actions, policyFile.Actions...)
 				pathFound = true
 				break