Skip to content

Commit

Permalink
Delete secondary VRG before primary
Browse files Browse the repository at this point in the history
Since we added a VRG on the secondary cluster we have a random failure
when deleting the DRPC after relocate. When this happens, we find the
PVC in terminating state on the secondary cluster, and the VR and VRG
are never deleted.

This change avoids this issue by deleting the secondary VRG first, and
deleting the primary VRG only after the secondary VRG was deleted.

When we wait for the deletion of a VRG, we must requeue the request. In
a real system we may be able to return success and detect the deletion
when a watched resource changes, but in unit tests breaks if we don't
requeue the request. The only way to do this with the current code is to
return an error. This logs many errors in the log during deletion, like:

    ERROR   Secondary VRG manifestwork deletion in progress
    very noisy
        stacktrace...

    ERROR Primary VRG manifestwork deletion in progress
    very noisy
        stacktrace...

More work is needed to silence the expected errors.

Fixes: RamenDR#1659
Signed-off-by: Nir Soffer <[email protected]>
  • Loading branch information
nirs authored and BenamarMk committed Dec 11, 2024
1 parent 342d73d commit 3409c8f
Showing 1 changed file with 39 additions and 7 deletions.
46 changes: 39 additions & 7 deletions internal/controller/drplacementcontrol_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,15 +724,15 @@ func (r *DRPlacementControlReconciler) cleanupVRGs(
return fmt.Errorf("failed to retrieve VRGs. We'll retry later. Error (%w)", err)
}

if !ensureVRGsManagedByDRPC(r.Log, mwu, vrgs, drpc, vrgNamespace) {
return fmt.Errorf("VRG adoption in progress")
// We have to ensure the secondary VRG is deleted before deleting the primary VRG. This will fail until there
// is no secondary VRG in the vrgs list.
if err := r.ensureVRGsDeleted(mwu, vrgs, drpc, vrgNamespace, rmn.Secondary); err != nil {
return err
}

// delete VRG manifestwork
for _, drClusterName := range rmnutil.DRPolicyClusterNames(drPolicy) {
if err := mwu.DeleteManifestWork(mwu.BuildManifestWorkName(rmnutil.MWTypeVRG), drClusterName); err != nil {
return fmt.Errorf("%w", err)
}
// This will fail until there is no primary VRG in the vrgs list.
if err := r.ensureVRGsDeleted(mwu, vrgs, drpc, vrgNamespace, rmn.Primary); err != nil {
return err
}

if len(vrgs) != 0 {
Expand All @@ -747,6 +747,38 @@ func (r *DRPlacementControlReconciler) cleanupVRGs(
return nil
}

// ensureVRGsDeleted ensure that secondary or primary VRGs are deleted. Return an error if a vrg could not be deleted,
// or deletion is in progress. Return nil if vrg of specified type was not found.
func (r *DRPlacementControlReconciler) ensureVRGsDeleted(
mwu rmnutil.MWUtil,
vrgs map[string]*rmn.VolumeReplicationGroup,
drpc *rmn.DRPlacementControl,
vrgNamespace string,
replicationState rmn.ReplicationState,
) error {
var inProgress bool

for cluster, vrg := range vrgs {
if vrg.Spec.ReplicationState == replicationState {
if !ensureVRGsManagedByDRPC(r.Log, mwu, vrgs, drpc, vrgNamespace) {
return fmt.Errorf("%s VRG adoption in progress", replicationState)
}

if err := mwu.DeleteManifestWork(mwu.BuildManifestWorkName(rmnutil.MWTypeVRG), cluster); err != nil {
return fmt.Errorf("failed to delete %s VRG manifestwork for cluster %q: %w", replicationState, cluster, err)
}

inProgress = true
}
}

if inProgress {
return fmt.Errorf("%s VRG manifestwork deletion in progress", replicationState)
}

return nil
}

func (r *DRPlacementControlReconciler) deleteAllManagedClusterViews(
drpc *rmn.DRPlacementControl, clusterNames []string,
) error {
Expand Down

0 comments on commit 3409c8f

Please sign in to comment.