Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OCPBUGS-8446: MCO-503: daemon: have a special path to sync in certs #3575

Merged
merged 4 commits into from
Mar 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/machine-config-daemon/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ func runStartCmd(cmd *cobra.Command, args []string) {
kubeClient,
ctx.InformerFactory.Machineconfiguration().V1().MachineConfigs(),
ctx.KubeInformerFactory.Core().V1().Nodes(),
ctx.InformerFactory.Machineconfiguration().V1().ControllerConfigs(),
startOpts.kubeletHealthzEnabled,
startOpts.kubeletHealthzEndpoint,
)
Expand Down
24 changes: 0 additions & 24 deletions install/0000_90_machine-config-operator_01_prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,30 +11,6 @@ metadata:
include.release.openshift.io/single-node-developer: "true"
spec:
groups:
# Alert group for paused machine config pools that are holding back a kubelet CA rotation.
# Both rules read machine_config_controller_paused_pool_kubelet_ca, which is used here as a
# unix timestamp (it is rendered with humanizeTimestamp below); a value > 0 is treated as
# "a rotation is pending for this pool".
- name: mcc-paused-pool-kubelet-ca
rules:
# Warning: the metric has been non-zero for the pool for 60 minutes.
# $value is the metric itself, i.e. the certificate expiry timestamp.
- alert: MachineConfigControllerPausedPoolKubeletCA
expr: |
max by (namespace,pool) (last_over_time(machine_config_controller_paused_pool_kubelet_ca[5m])) > 0
for: 60m
labels:
namespace: openshift-machine-config-operator
severity: warning
annotations:
summary: "Paused machine configuration pool '{{$labels.pool}}' is blocking a necessary certificate rotation and must be unpaused before the current kube-apiserver-to-kubelet-signer certificate expires on {{ $value | humanizeTimestamp }}."
description: "Machine config pools have a 'pause' feature, which allows config to be rendered, but prevents it from being rolled out to the nodes. This alert indicates that a certificate rotation has taken place, and the new kubelet-ca certificate bundle has been rendered into a machine config, but because the pool '{{$labels.pool}}' is paused, the config cannot be rolled out to the nodes in that pool. You will notice almost immediately that for nodes in pool '{{$labels.pool}}', pod logs will not be visible in the console and interactive commands (oc log, oc exec, oc debug, oc attach) will not work. You must unpause machine config pool '{{$labels.pool}}' to let the certificates through before the kube-apiserver-to-kubelet-signer certificate expires on {{ $value | humanizeTimestamp }} or this pool's nodes will cease to function properly."
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/machine-config-operator/MachineConfigControllerPausedPoolKubeletCA.md
# Critical: same alert name, but additionally requires that the expiry timestamp minus the
# current time is below 14 days (86400 * 14 seconds), held for 60 minutes.
# $value is the left-hand side of the AND, i.e. the remaining seconds until expiry,
# which is why it is rendered with humanizeDuration.
- alert: MachineConfigControllerPausedPoolKubeletCA
expr: |
max by (namespace,pool) (last_over_time(machine_config_controller_paused_pool_kubelet_ca[5m]) - time()) < (86400 * 14) AND max by (namespace,pool) (last_over_time(machine_config_controller_paused_pool_kubelet_ca[5m])) > 0
for: 60m
labels:
namespace: openshift-machine-config-operator
severity: critical
annotations:
summary: "Paused machine configuration pool '{{$labels.pool}}' is blocking a necessary certificate rotation and must be unpaused before the current kube-apiserver-to-kubelet-signer certificate expires in {{ $value | humanizeDuration }}."
description: "Machine config pools have a 'pause' feature, which allows config to be rendered, but prevents it from being rolled out to the nodes. This alert indicates that a certificate rotation has taken place, and the new kubelet-ca certificate bundle has been rendered into a machine config, but because the pool '{{$labels.pool}}' is paused, the config cannot be rolled out to the nodes in that pool. You will notice almost immediately that for nodes in pool '{{$labels.pool}}', pod logs will not be visible in the console and interactive commands (oc log, oc exec, oc debug, oc attach) will not work. You must unpause machine config pool '{{$labels.pool}}' to let the certificates through before the kube-apiserver-to-kubelet-signer certificate expires. You have approximately {{ $value | humanizeDuration }} remaining before this happens and nodes in '{{$labels.pool}}' cease to function properly."
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/machine-config-operator/MachineConfigControllerPausedPoolKubeletCA.md
- name: os-image-override.rules
rules:
- expr: sum(os_image_url_override)
Expand Down
2 changes: 1 addition & 1 deletion manifests/machineconfigdaemon/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rules:
resources: ["nodes"]
verbs: ["get", "list", "watch"]
- apiGroups: ["machineconfiguration.openshift.io"]
resources: ["machineconfigs"]
resources: ["machineconfigs", "controllerconfigs"]
verbs: ["get", "list", "watch"]
- apiGroups: ["security.openshift.io"]
resourceNames: ["privileged"]
Expand Down
3 changes: 3 additions & 0 deletions manifests/machineconfigserver/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ rules:
- apiGroups: ["machineconfiguration.openshift.io"]
resources: ["machineconfigs", "machineconfigpools"]
verbs: ["*"]
- apiGroups: ["machineconfiguration.openshift.io"]
resources: ["controllerconfigs"]
verbs: ["get", "watch", "list"]
- apiGroups: ["security.openshift.io"]
resourceNames: ["hostnetwork"]
resources: ["securitycontextconstraints"]
Expand Down
49 changes: 0 additions & 49 deletions pkg/controller/common/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@ import (
"bytes"
"compress/gzip"
"context"
"crypto/x509"
"encoding/base64"
"encoding/pem"
"errors"
"fmt"
"io"
Expand Down Expand Up @@ -963,53 +961,6 @@ func GetIgnitionFileDataByPath(config *ign3types.Config, path string) ([]byte, e
return nil, nil
}

// GetCertificatesFromPEMBundle splits a PEM-encoded bundle into the x509
// certificates it contains.
//
// Every PEM block in pemBytes is visited in order. Blocks whose type is not
// "CERTIFICATE" (keys, for example) are ignored, and a certificate block that
// fails to parse is logged as a warning and skipped so that one bad entry does
// not hide the rest of the bundle. The returned error is always nil.
func GetCertificatesFromPEMBundle(pemBytes []byte) ([]*x509.Certificate, error) {
	var parsed []*x509.Certificate

	// pem.Decode hands back the next block plus the unread remainder; a nil
	// block means there is nothing left to decode.
	for block, remaining := pem.Decode(pemBytes); block != nil; block, remaining = pem.Decode(remaining) {
		if block.Type != "CERTIFICATE" {
			continue
		}

		certificate, parseErr := x509.ParseCertificate(block.Bytes)
		if parseErr != nil {
			// Not fatal: only this entry is junk, later ones may be fine.
			glog.Warningf("Failed to parse certificate: %v", parseErr.Error())
			continue
		}
		parsed = append(parsed, certificate)
	}

	return parsed, nil
}

// GetLongestValidCertificate returns the latest-expiring certificate from
// certificateList whose Subject.CommonName starts with any of subjectPrefixes.
// It returns nil when no certificate matches (including when either argument
// is empty).
//
// The list is examined in a single pass and is never reordered, so the
// caller's slice is left exactly as it was passed in. Nil entries are skipped.
// When several matching certificates share the same NotAfter, the one that
// appears first in certificateList is returned.
func GetLongestValidCertificate(certificateList []*x509.Certificate, subjectPrefixes []string) *x509.Certificate {
	// matchesPrefix reports whether commonName starts with any accepted prefix.
	matchesPrefix := func(commonName string) bool {
		for _, prefix := range subjectPrefixes {
			if strings.HasPrefix(commonName, prefix) {
				return true
			}
		}
		return false
	}

	var latest *x509.Certificate
	for _, certificate := range certificateList {
		if certificate == nil {
			continue
		}
		// Only a strictly later expiry can replace the current best; checking
		// this first avoids the prefix scan for certificates that cannot win.
		if latest != nil && !certificate.NotAfter.After(latest.NotAfter) {
			continue
		}
		if matchesPrefix(certificate.Subject.CommonName) {
			latest = certificate
		}
	}
	return latest
}

// GetDefaultBaseImageContainer is kind of a "soft feature gate" for using the "new format" image by default, its behavior
// is determined by the "UseNewFormatImageByDefault" boolean
func GetDefaultBaseImageContainer(cconfigspec *mcfgv1.ControllerConfigSpec) string {
Expand Down
8 changes: 0 additions & 8 deletions pkg/controller/common/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,6 @@ const (

// MCC Metrics
var (
// MachineConfigControllerPausedPoolKubeletCA logs when a certificate rotation is being held up by pause
MachineConfigControllerPausedPoolKubeletCA = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "machine_config_controller_paused_pool_kubelet_ca",
Help: "Set to the unix timestamp in utc of the current certificate expiry date if a certificate rotation is pending in specified paused pool",
}, []string{"pool"})

// OSImageURLOverride tells whether cluster is using default OS image or has been overridden by user
OSImageURLOverride = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Expand All @@ -41,7 +34,6 @@ var (

func RegisterMCCMetrics() error {
err := RegisterMetrics([]prometheus.Collector{
MachineConfigControllerPausedPoolKubeletCA,
OSImageURLOverride,
MCCDrainErr,
})
Expand Down
127 changes: 0 additions & 127 deletions pkg/controller/node/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@ package node

import (
"context"
"crypto/x509"
"encoding/json"
"fmt"
"reflect"
"sort"
"time"

ign3types "github.com/coreos/ignition/v2/config/v3_2/types"
"github.com/golang/glog"
configv1 "github.com/openshift/api/config/v1"
cligoinformersv1 "github.com/openshift/client-go/config/informers/externalversions/config/v1"
Expand All @@ -26,7 +24,6 @@ import (
mcfglistersv1 "github.com/openshift/machine-config-operator/pkg/generated/listers/machineconfiguration.openshift.io/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -68,15 +65,8 @@ const (

// schedulerCRName that we're interested in watching.
schedulerCRName = "cluster"

// kubeletCAFilePath is the expected file path for the kubelet ca
kubeletCAFilePath = "/etc/kubernetes/kubelet-ca.crt"
)

// kubeAPIToKubeletSignerNamePrefixes is the list of subject common names that are regarded as a kube-apiserver-to-kubelet-signer ca certificate
// Based on naming convention from https://github.com/openshift/library-go/blob/ed9bc958bd8a2fff079d52976806e4e0a8a7c315/pkg/operator/certrotation/signer.go#L132
var kubeAPIToKubeletSignerNamePrefixes = []string{"openshift-kube-apiserver-operator_kube-apiserver-to-kubelet-signer@", "kube-apiserver-to-kubelet-signer"}

// Controller defines the node controller.
type Controller struct {
client mcfgclientset.Interface
Expand Down Expand Up @@ -779,17 +769,9 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
if mcfgv1.IsMachineConfigPoolConditionTrue(pool.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) {
glog.Infof("Pool %s is paused and will not update.", pool.Name)
}

// Only check for pending files if we're out of sync
if pool.Spec.Configuration.Name != pool.Status.Configuration.Name {
ctrl.setPendingFileMetrics(pool)
}
return ctrl.syncStatusOnly(pool)
}

// We aren't paused anymore, so reset the metrics
ctrl.resetPendingFileMetrics(pool)

nodes, err := ctrl.getNodesForPool(pool)
if err != nil {
if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil {
Expand Down Expand Up @@ -1179,112 +1161,3 @@ func getErrorString(err error) string {
}
return ""
}

// setPendingFileMetrics publishes the paused-pool kubelet CA metric for pool
// when the kubelet CA bundle is among the files that differ between the
// pool's current and pending machine configs.
//
// The metric is set to the unix expiry time of the newest
// kube-apiserver-to-kubelet-signer certificate found in the pool's *current*
// config. Failures are logged as warnings and leave the metric untouched.
func (ctrl *Controller) setPendingFileMetrics(pool *mcfgv1.MachineConfigPool) {
	// Fetch both configs and convert them to ignition v3.
	currentConfig, pendingConfig, err := ctrl.parseConvertMachineConfigFilesForPool(pool)
	if err != nil {
		glog.Warningf("Error converting pool configs for %s pool: %v", pool.Name, err)
		return
	}

	// The kubelet CA bundle is the only changed file we care about.
	kubeletCAChanged := false
	for _, changedPath := range ctrlcommon.CalculateConfigFileDiffs(currentConfig, pendingConfig) {
		if changedPath == kubeletCAFilePath {
			kubeletCAChanged = true
			break
		}
	}
	if !kubeletCAChanged {
		return
	}

	// Pick the newest signer certificate (there may have been several
	// rotations) so the metric can be used to count down to its expiry.
	signer, err := ctrl.getNewestAPIToKubeletSignerCertificate(currentConfig)
	if err != nil {
		glog.Warningf("Error retrieving kubelet-ca certificates from pool %s: %v", pool.Name, err)
		return
	}

	expiry := signer.NotAfter.UTC()
	glog.V(2).Infof("Kubelet CA is stuck in paused pool %s. Setting metric to expiry date of %s (%s)", pool.Name, signer.Subject.CommonName, expiry)
	ctrlcommon.MachineConfigControllerPausedPoolKubeletCA.WithLabelValues(pool.Name).Set(float64(expiry.Unix()))
}

// resetPendingFileMetrics clears the paused-pool kubelet CA metric for pool
// by writing zero to the gauge labelled with the pool's name.
func (ctrl *Controller) resetPendingFileMetrics(pool *mcfgv1.MachineConfigPool) {
	poolGauge := ctrlcommon.MachineConfigControllerPausedPoolKubeletCA.WithLabelValues(pool.Name)
	poolGauge.Set(0)
}

// parseConvertMachineConfigFilesForPool looks up the machine configs named by
// pool.Status.Configuration (current) and pool.Spec.Configuration (pending),
// parses and converts both, and returns them as ignition v3 Config objects.
// The controller needs the actual configurations so it can diff their file
// lists and work out which new files are "stuck" behind a paused pool.
//
// Any error from the lister or from the conversion is returned with nil
// configs. Every lister error is treated as fatal for this call, not only
// NotFound, so a config is never dereferenced after a failed lookup.
func (ctrl *Controller) parseConvertMachineConfigFilesForPool(pool *mcfgv1.MachineConfigPool) (current, pending *ign3types.Config, err error) {
	// The config we're in right now
	currentName := pool.Status.Configuration.Name
	// The config we would be going to
	pendingName := pool.Spec.Configuration.Name

	// Get the machine config objects
	currentConfig, err := ctrl.mcLister.Get(currentName)
	if err != nil {
		if apierrors.IsNotFound(err) {
			glog.V(2).Infof("MachineConfig %v has been deleted", currentName)
		}
		return nil, nil, err
	}

	pendingConfig, err := ctrl.mcLister.Get(pendingName)
	if err != nil {
		if apierrors.IsNotFound(err) {
			// pendingName refers to a MachineConfig, not a pool.
			glog.V(2).Infof("MachineConfig %v has been deleted", pendingName)
		}
		return nil, nil, err
	}

	// Make sure we can coax the objects into ignitionv3
	currentIgnConfig, err := ctrlcommon.ParseAndConvertConfig(currentConfig.Spec.Config.Raw)
	if err != nil {
		return nil, nil, err
	}
	pendingIgnConfig, err := ctrlcommon.ParseAndConvertConfig(pendingConfig.Spec.Config.Raw)
	if err != nil {
		return nil, nil, err
	}

	return &currentIgnConfig, &pendingIgnConfig, nil
}

// getNewestAPIToKubeletSignerCertificate extracts the latest-expiring
// kube-apiserver-to-kubelet-signer certificate from the kubelet-ca.crt bundle
// stored in statusIgnConfig. Its expiry date feeds the paused-pool
// metrics/alerting: once this certificate expires, nodes still using it stop
// communicating with the cluster.
//
// An error is returned when the bundle cannot be read or parsed, when it holds
// no certificates at all, or when none of them carries a signer common name.
func (ctrl *Controller) getNewestAPIToKubeletSignerCertificate(statusIgnConfig *ign3types.Config) (*x509.Certificate, error) {
	// Pull the raw bundle bytes out of the ignition config.
	bundleData, err := ctrlcommon.GetIgnitionFileDataByPath(statusIgnConfig, kubeletCAFilePath)
	if err != nil {
		return nil, err
	}

	// Break the bundle into individual certificates.
	bundleCerts, err := ctrlcommon.GetCertificatesFromPEMBundle(bundleData)
	switch {
	case err != nil:
		return nil, err
	case len(bundleCerts) == 0:
		// We have other problems if this is empty, but it's possible.
		return nil, fmt.Errorf("no certificates found in bundle")
	}

	// The original signer and the rotated ones are named differently, and the
	// rotated names end in a timestamp that changes on every rotation, so the
	// match is done on common-name prefixes rather than exact names.
	if signer := ctrlcommon.GetLongestValidCertificate(bundleCerts, kubeAPIToKubeletSignerNamePrefixes); signer != nil {
		return signer, nil
	}

	// Shouldn't come back with nothing, but just in case we do.
	return nil, fmt.Errorf("no matching kube-apiserver-to-kubelet-signer certificates found in bundle")
}
Loading