Skip to content

Commit

Permalink
fix: avoid concurrent updates on the designated primary (cloudnative-pg…
Browse files Browse the repository at this point in the history
…#5755)

This patch fixes a condition where a demoted replica cluster is stuck
waiting for the primary to be demoted to a designated primary, due to
concurrent updates of the Cluster CR by the operator and the instance
manager.

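The instance manager now writes the Cluster status with an update call
instead of a patch, relying on the Kubernetes optimistic locking mechanism:
if the update fails with a conflict, the latest Cluster is fetched again
and the update is retried.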

Closes: cloudnative-pg#5754 

Signed-off-by: Leonardo Cecchi <[email protected]>
Co-authored-by: Armando Ruocco <[email protected]>
  • Loading branch information
leonardoce and armru authored Oct 9, 2024
1 parent 9c3105f commit 0f82afc
Showing 1 changed file with 20 additions and 7 deletions.
27 changes: 20 additions & 7 deletions internal/management/controller/instance_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
Expand Down Expand Up @@ -1281,13 +1282,25 @@ func (r *InstanceReconciler) reconcileDesignatedPrimary(
// I'm the primary, need to inform the operator
log.FromContext(ctx).Info("Setting myself as the current designated primary")

oldCluster := cluster.DeepCopy()
cluster.Status.CurrentPrimary = r.instance.GetPodName()
cluster.Status.CurrentPrimaryTimestamp = pgTime.GetCurrentTimestamp()
if r.instance.RequiresDesignatedPrimaryTransition {
externalcluster.SetDesignatedPrimaryTransitionCompleted(cluster)
}
return changed, r.client.Status().Patch(ctx, cluster, client.MergeFrom(oldCluster))
return changed, retry.RetryOnConflict(retry.DefaultBackoff, func() error {
var livingCluster apiv1.Cluster

err := r.client.Get(ctx, client.ObjectKeyFromObject(cluster), &livingCluster)
if err != nil {
return err
}

updatedCluster := livingCluster.DeepCopy()
updatedCluster.Status.CurrentPrimary = r.instance.GetPodName()
updatedCluster.Status.CurrentPrimaryTimestamp = pgTime.GetCurrentTimestamp()
if r.instance.RequiresDesignatedPrimaryTransition {
externalcluster.SetDesignatedPrimaryTransitionCompleted(updatedCluster)
}

cluster.Status = updatedCluster.Status

return r.client.Status().Update(ctx, updatedCluster)
})
}

// waitForWalReceiverDown wait until the wal receiver is down, and it's used
Expand Down

0 comments on commit 0f82afc

Please sign in to comment.