Delete secondary VRG before primary

Since we added a VRG on the secondary cluster we have a random failure when deleting the DRPC after relocate. When this happens, we find the PVC in terminating state on the secondary cluster, and the VR and VRG are never deleted. This change avoids this issue by deleting the secondary VRG first, and deleting the primary VRG only after the secondary VRG was deleted. When we wait for the deletion of a VRG, we must requeue the request. In a real system we may be able to return success and detect the deletion when a watched resource changes, but in unit tests breaks if we don't requeue the request. The only way to do this with the current code is to return an error. This logs many errors in the log during deletion, like: ERROR Secondary VRG manifestwork deletion in progress very noisy stacktrace... ERROR Primary VRG manifestwork deletion in progress very noisy stacktrace... More work is needed to silence the expected errors. Fixes: RamenDR#1659 Signed-off-by: Nir Soffer <[email protected]>
raghavendra-talur · Dec 11, 2024 · 3409c8f · 3409c8f
1 parent 342d73d
commit 3409c8f
Showing 1 changed file with 39 additions and 7 deletions.
diff --git a/internal/controller/drplacementcontrol_controller.go b/internal/controller/drplacementcontrol_controller.go
@@ -724,15 +724,15 @@ func (r *DRPlacementControlReconciler) cleanupVRGs(
 		return fmt.Errorf("failed to retrieve VRGs. We'll retry later. Error (%w)", err)
 	}
 
-	if !ensureVRGsManagedByDRPC(r.Log, mwu, vrgs, drpc, vrgNamespace) {
-		return fmt.Errorf("VRG adoption in progress")
+	// We have to ensure the secondary VRG is deleted before deleting the primary VRG. This will fail until there
+	// is no secondary VRG in the vrgs list.
+	if err := r.ensureVRGsDeleted(mwu, vrgs, drpc, vrgNamespace, rmn.Secondary); err != nil {
+		return err
 	}
 
-	// delete VRG manifestwork
-	for _, drClusterName := range rmnutil.DRPolicyClusterNames(drPolicy) {
-		if err := mwu.DeleteManifestWork(mwu.BuildManifestWorkName(rmnutil.MWTypeVRG), drClusterName); err != nil {
-			return fmt.Errorf("%w", err)
-		}
+	// This will fail until there is no primary VRG in the vrgs list.
+	if err := r.ensureVRGsDeleted(mwu, vrgs, drpc, vrgNamespace, rmn.Primary); err != nil {
+		return err
 	}
 
 	if len(vrgs) != 0 {
@@ -747,6 +747,38 @@ func (r *DRPlacementControlReconciler) cleanupVRGs(
 	return nil
 }
 
+// ensureVRGsDeleted ensure that secondary or primary VRGs are deleted. Return an error if a vrg could not be deleted,
+// or deletion is in progress. Return nil if vrg of specified type was not found.
+func (r *DRPlacementControlReconciler) ensureVRGsDeleted(
+	mwu rmnutil.MWUtil,
+	vrgs map[string]*rmn.VolumeReplicationGroup,
+	drpc *rmn.DRPlacementControl,
+	vrgNamespace string,
+	replicationState rmn.ReplicationState,
+) error {
+	var inProgress bool
+
+	for cluster, vrg := range vrgs {
+		if vrg.Spec.ReplicationState == replicationState {
+			if !ensureVRGsManagedByDRPC(r.Log, mwu, vrgs, drpc, vrgNamespace) {
+				return fmt.Errorf("%s VRG adoption in progress", replicationState)
+			}
+
+			if err := mwu.DeleteManifestWork(mwu.BuildManifestWorkName(rmnutil.MWTypeVRG), cluster); err != nil {
+				return fmt.Errorf("failed to delete %s VRG manifestwork for cluster %q: %w", replicationState, cluster, err)
+			}
+
+			inProgress = true
+		}
+	}
+
+	if inProgress {
+		return fmt.Errorf("%s VRG manifestwork deletion in progress", replicationState)
+	}
+
+	return nil
+}
+
 func (r *DRPlacementControlReconciler) deleteAllManagedClusterViews(
 	drpc *rmn.DRPlacementControl, clusterNames []string,
 ) error {