Skip to content

Commit

Permalink
fix: [Linux] [NPM] handle #2977 for netpols without cidrs (#2990)
Browse files Browse the repository at this point in the history
* fix: [Linux] [NPM] handle #2977 for netpols without cidrs

Signed-off-by: Hunter Gregory <[email protected]>

* fix: lock and no need to track policy key

Signed-off-by: Hunter Gregory <[email protected]>

* style: remove dead code

Signed-off-by: Hunter Gregory <[email protected]>

---------

Signed-off-by: Hunter Gregory <[email protected]>
  • Loading branch information
huntergregory authored Sep 11, 2024
1 parent 47b4329 commit 60ddd35
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 32 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
vendor/

# Binaries
out/*
output/*
Expand Down
85 changes: 60 additions & 25 deletions npm/pkg/dataplane/dataplane.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"strings"
"sync"
"time"

"github.com/Azure/azure-container-networking/common"
Expand All @@ -18,12 +19,12 @@ import (
const (
reconcileDuration = time.Duration(5 * time.Minute)

contextBackground = "BACKGROUND"
contextApplyDP = "APPLY-DP"
contextAddNetPol = "ADD-NETPOL"
contextAddNetPolBootup = "BOOTUP-ADD-NETPOL"
contextAddNetPolCIDRPrecaution = "ADD-NETPOL-CIDR-PRECAUTION"
contextDelNetPol = "DEL-NETPOL"
contextBackground = "BACKGROUND"
contextApplyDP = "APPLY-DP"
contextAddNetPol = "ADD-NETPOL"
contextAddNetPolBootup = "BOOTUP-ADD-NETPOL"
contextAddNetPolPrecaution = "ADD-NETPOL-PRECAUTION"
contextDelNetPol = "DEL-NETPOL"
)

var (
Expand All @@ -48,6 +49,11 @@ type Config struct {
*policies.PolicyManagerCfg
}

type removePolicyInfo struct {
sync.Mutex
previousRemovePolicyIPSetsFailed bool
}

type DataPlane struct {
*Config
applyInBackground bool
Expand All @@ -64,7 +70,10 @@ type DataPlane struct {
endpointQuery *endpointQuery
applyInfo *applyInfo
netPolQueue *netPolQueue
stopChannel <-chan struct{}
// removePolicyInfo tracks when a policy was removed yet had ApplyIPSet failures.
// This field is only relevant for Linux.
removePolicyInfo removePolicyInfo
stopChannel <-chan struct{}
}

func NewDataPlane(nodeName string, ioShim *common.IOShim, cfg *Config, stopChannel <-chan struct{}) (*DataPlane, error) {
Expand Down Expand Up @@ -335,6 +344,9 @@ func (dp *DataPlane) applyDataPlaneNow(context string) error {
}
klog.Infof("[DataPlane] [ApplyDataPlane] [%s] finished applying ipsets", context)

// see comment in RemovePolicy() for why this is here
dp.setRemovePolicyFailure(false)

if dp.applyInBackground {
dp.applyInfo.Lock()
dp.applyInfo.numBatches = 0
Expand Down Expand Up @@ -472,26 +484,17 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {
}
}

if !util.IsWindowsDP() {
for _, netPol := range netPols {
if !(netPol.HasCIDRRules() && dp.ipsetMgr.PreviousApplyFailed()) {
continue
}

if inBootupPhase {
// this should never happen because bootup phase is for windows, but just in case, we don't want to applyDataplaneNow() or else there will be a deadlock on dp.applyInfo
msg := fmt.Sprintf("[DataPlane] [%s] at risk of improperly applying a CIDR policy which is removed then readded", contextAddNetPolCIDRPrecaution)
klog.Warning(msg)
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, msg)
break
}

if dp.hadRemovePolicyFailure() {
if inBootupPhase {
// this should never happen because bootup phase is for windows, but just in case, we don't want to applyDataplaneNow() or else there will be a deadlock on dp.applyInfo
msg := fmt.Sprintf("[DataPlane] [%s] at risk of improperly applying a policy which is removed then readded", contextAddNetPolPrecaution)
klog.Warning(msg)
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, msg)
} else {
// prevent #2977
if err := dp.applyDataPlaneNow(contextAddNetPolCIDRPrecaution); err != nil {
if err := dp.applyDataPlaneNow(contextAddNetPolPrecaution); err != nil {
return err // nolint:wrapcheck // unnecessary to wrap error since the provided context is included in the error
}

break
}
}

Expand Down Expand Up @@ -531,6 +534,9 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {
}
klog.Infof("[DataPlane] [%s] finished applying ipsets", contextAddNetPolBootup)

// see comment in RemovePolicy() for why this is here
dp.setRemovePolicyFailure(false)

dp.applyInfo.numBatches = 0
}

Expand Down Expand Up @@ -627,7 +633,16 @@ func (dp *DataPlane) RemovePolicy(policyKey string) error {
return err
}

return dp.applyDataPlaneNow(contextApplyDP)
if err := dp.applyDataPlaneNow(contextDelNetPol); err != nil {
// Failed to apply IPSets while removing this policy.
// Consider this removepolicy call a failure until apply IPSets is successful.
// Related to #2977
klog.Info("[DataPlane] remove policy has failed to apply ipsets. setting remove policy failure")
dp.setRemovePolicyFailure(true)
return err // nolint:wrapcheck // unnecessary to wrap error since the provided context is included in the error
}

return nil
}

// UpdatePolicy takes in updated policy object, calculates the delta and applies changes
Expand Down Expand Up @@ -749,3 +764,23 @@ func (dp *DataPlane) deleteIPSetsAndReferences(sets []*ipsets.TranslatedIPSet, n
}
return nil
}

func (dp *DataPlane) setRemovePolicyFailure(failed bool) {
if util.IsWindowsDP() {
return
}

dp.removePolicyInfo.Lock()
defer dp.removePolicyInfo.Unlock()
dp.removePolicyInfo.previousRemovePolicyIPSetsFailed = failed
}

func (dp *DataPlane) hadRemovePolicyFailure() bool {
if util.IsWindowsDP() {
return false
}

dp.removePolicyInfo.Lock()
defer dp.removePolicyInfo.Unlock()
return dp.removePolicyInfo.previousRemovePolicyIPSetsFailed
}
7 changes: 0 additions & 7 deletions npm/pkg/dataplane/ipsets/ipsetmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,6 @@ func NewIPSetManager(iMgrCfg *IPSetManagerCfg, ioShim *common.IOShim) *IPSetMana
}
}

// PreviousApplyFailed is only relevant for Linux right now since Windows doesn't track failures
func (iMgr *IPSetManager) PreviousApplyFailed() bool {
iMgr.Lock()
defer iMgr.Unlock()
return iMgr.consecutiveApplyFailures > 0
}

/*
Reconcile removes empty/unreferenced sets from the cache.
For ApplyAllIPSets mode, those sets are added to the toDeleteCache.
Expand Down

0 comments on commit 60ddd35

Please sign in to comment.