Skip to content

Commit

Permalink
Support prometheus metrics
Browse files Browse the repository at this point in the history
certmanager_csi_certificate_request_expiration_timestamp_seconds
certmanager_csi_certificate_request_ready_status
certmanager_csi_certificate_request_renewal_timestamp_seconds
certmanager_csi_driver_issue_call_count
certmanager_csi_driver_issue_error_count
certmanager_csi_managed_certificate_count
certmanager_csi_managed_volume_count

fixes: cert-manager#60
Signed-off-by: Jing Liu <[email protected]>
  • Loading branch information
7ing committed Nov 22, 2024
1 parent c552f3a commit c381e55
Show file tree
Hide file tree
Showing 9 changed files with 951 additions and 18 deletions.
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/container-storage-interface/spec v1.10.0
github.com/go-logr/logr v1.4.2
github.com/kubernetes-csi/csi-lib-utils v0.19.0
github.com/prometheus/client_golang v1.20.4
github.com/stretchr/testify v1.9.0
google.golang.org/grpc v1.66.2
k8s.io/apimachinery v0.31.1
Expand Down Expand Up @@ -42,6 +43,7 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/moby/sys/mountinfo v0.7.2 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
Expand All @@ -51,7 +53,6 @@ require (
github.com/opencontainers/runtime-spec v1.2.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.20.4 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.59.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
Expand Down
53 changes: 46 additions & 7 deletions manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ import (
internalapi "github.com/cert-manager/csi-lib/internal/api"
internalapiutil "github.com/cert-manager/csi-lib/internal/api/util"
"github.com/cert-manager/csi-lib/metadata"
"github.com/cert-manager/csi-lib/metrics"
"github.com/cert-manager/csi-lib/storage"
)

Expand Down Expand Up @@ -89,6 +90,9 @@ type Options struct {

// RenewalBackoffConfig configures the exponential backoff applied to certificate renewal failures.
RenewalBackoffConfig *wait.Backoff

// Metrics is used for exposing Prometheus metrics
Metrics *metrics.Metrics
}

// NewManager constructs a new manager used to manage volumes containing
Expand Down Expand Up @@ -126,6 +130,9 @@ func NewManager(opts Options) (*Manager, error) {
if opts.Log == nil {
return nil, errors.New("log must be set")
}
if opts.Metrics == nil {
opts.Metrics = metrics.New(opts.Log)
}
if opts.MetadataReader == nil {
return nil, errors.New("MetadataReader must be set")
}
Expand Down Expand Up @@ -241,6 +248,7 @@ func NewManager(opts Options) (*Manager, error) {
metadataReader: opts.MetadataReader,
clock: opts.Clock,
log: *opts.Log,
metrics: opts.Metrics,

generatePrivateKey: opts.GeneratePrivateKey,
generateRequest: opts.GenerateRequest,
Expand Down Expand Up @@ -375,6 +383,9 @@ type Manager struct {
// No thread safety is added around this field, and it MUST NOT be used for any implementation logic.
// It should not be used full-stop :).
doNotUse_CallOnEachIssue func()

// metrics is used to expose Prometheus metrics.
metrics *metrics.Metrics
}

// issue will step through the entire issuance flow for a volume.
Expand All @@ -387,6 +398,9 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
log := m.log.WithValues("volume_id", volumeID)
log.Info("Processing issuance")

// Increase issue count
m.metrics.IncrementIssueCallCount(m.nodeNameHash, volumeID)

if err := m.cleanupStaleRequests(ctx, log, volumeID); err != nil {
return fmt.Errorf("cleaning up stale requests: %w", err)
}
Expand Down Expand Up @@ -594,7 +608,7 @@ func (m *Manager) handleRequest(ctx context.Context, volumeID string, meta metad
// Calculate the default next issuance time.
// The implementation's writeKeypair function may override this value before
// writing to the storage layer.
renewalPoint, err := calculateNextIssuanceTime(req.Status.Certificate)
expiryPoint, renewalPoint, err := getExpiryAndDefaultNextIssuanceTime(req.Status.Certificate)
if err != nil {
return fmt.Errorf("calculating next issuance time: %w", err)
}
Expand All @@ -606,6 +620,10 @@ func (m *Manager) handleRequest(ctx context.Context, volumeID string, meta metad
}
log.V(2).Info("Wrote new keypair to storage")

// Update the request metrics.
// Using meta.NextIssuanceTime instead of renewalPoint here, in case writeKeypair overrides the value.
m.metrics.UpdateCertificateRequest(req, expiryPoint, *meta.NextIssuanceTime)

// We must explicitly delete the private key from the pending requests map so that the existing Completed
// request will not be re-used upon renewal.
// Without this, the renewal would pick up the existing issued certificate and re-issue, rather than requesting
Expand Down Expand Up @@ -657,6 +675,9 @@ func (m *Manager) cleanupStaleRequests(ctx context.Context, log logr.Logger, vol
}
}

// Remove the CertificateRequest from the metrics.
m.metrics.RemoveCertificateRequest(toDelete.Name, toDelete.Namespace)

log.Info("Deleted CertificateRequest resource", "name", toDelete.Name, "namespace", toDelete.Namespace)
}

Expand Down Expand Up @@ -756,6 +777,8 @@ func (m *Manager) ManageVolumeImmediate(ctx context.Context, volumeID string) (m
// If issuance fails, immediately return without retrying so the caller can decide
// how to proceed depending on the context this method was called within.
if err := m.issue(ctx, volumeID); err != nil {
// Increase issue error count
m.metrics.IncrementIssueErrorCount(m.nodeNameHash, volumeID)
return true, err
}
}
Expand Down Expand Up @@ -783,6 +806,8 @@ func (m *Manager) manageVolumeIfNotManaged(volumeID string) (managed bool) {
// construct a new channel used to stop management of the volume
stopCh := make(chan struct{})
m.managedVolumes[volumeID] = stopCh
// Increase managed volume count for this driver
m.metrics.IncrementManagedVolumeCount(m.nodeNameHash)

return true
}
Expand All @@ -800,6 +825,10 @@ func (m *Manager) startRenewalRoutine(volumeID string) (started bool) {
return false
}

// Increase managed certificate count for this driver.
// We assume each volume will have one certificate to be managed.
m.metrics.IncrementManagedCertificateCount(m.nodeNameHash)

// Create a context that will be cancelled when the stopCh is closed
ctx, cancel := context.WithCancel(context.Background())
go func() {
Expand Down Expand Up @@ -835,6 +864,8 @@ func (m *Manager) startRenewalRoutine(volumeID string) (started bool) {
defer issueCancel()
if err := m.issue(issueCtx, volumeID); err != nil {
log.Error(err, "Failed to issue certificate, retrying after applying exponential backoff")
// Increase issue error count
m.metrics.IncrementIssueErrorCount(m.nodeNameHash, volumeID)
return false, nil
}
return true, nil
Expand Down Expand Up @@ -874,6 +905,14 @@ func (m *Manager) UnmanageVolume(volumeID string) {
if stopCh, ok := m.managedVolumes[volumeID]; ok {
close(stopCh)
delete(m.managedVolumes, volumeID)
if reqs, err := m.listAllRequestsForVolume(volumeID); err == nil {
// Remove the CertificateRequest from the metrics with the best effort.
for _, req := range reqs {
if req != nil {
m.metrics.RemoveCertificateRequest(req.Name, req.Namespace)
}
}
}
}
}

Expand Down Expand Up @@ -919,19 +958,19 @@ func (m *Manager) Stop() {
}
}

// getExpiryAndDefaultNextIssuanceTime returns the expiry time (NotAfter) of the
// first certificate found in the PEM encoded chain, together with the default
// time at which that certificate should be renewed by the driver: 2/3rds
// through its lifetime (NotAfter - NotBefore).
//
// An error (and zero times) is returned if chain contains no PEM block or if
// the first block cannot be parsed as an x509 certificate.
func getExpiryAndDefaultNextIssuanceTime(chain []byte) (time.Time, time.Time, error) {
	block, _ := pem.Decode(chain)
	// pem.Decode returns a nil block when no PEM data is found; guard against
	// it so that a malformed or empty chain yields an error instead of a nil
	// pointer dereference.
	if block == nil {
		return time.Time{}, time.Time{}, errors.New("parsing issued certificate: no PEM data found")
	}

	crt, err := x509.ParseCertificate(block.Bytes)
	if err != nil {
		return time.Time{}, time.Time{}, fmt.Errorf("parsing issued certificate: %w", err)
	}

	actualDuration := crt.NotAfter.Sub(crt.NotBefore)

	// Renew once one third of the lifetime remains, i.e. 2/3rds through it.
	renewBeforeNotAfter := actualDuration / 3

	return crt.NotAfter, crt.NotAfter.Add(-renewBeforeNotAfter), nil
}
17 changes: 10 additions & 7 deletions manager/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ func TestManager_cleanupStaleRequests(t *testing.T) {
}
}

func Test_calculateNextIssuanceTime(t *testing.T) {
func Test_getExpiryAndDefaultNextIssuanceTime(t *testing.T) {
notBefore := time.Date(1970, time.January, 1, 0, 0, 0, 0, time.UTC)
notAfter := time.Date(1970, time.January, 4, 0, 0, 0, 0, time.UTC)
pk, err := rsa.GenerateKey(rand.Reader, 2048)
Expand All @@ -474,20 +474,23 @@ func Test_calculateNextIssuanceTime(t *testing.T) {
certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes})

tests := map[string]struct {
expTime time.Time
expErr bool
expTime time.Time
renewTime time.Time
expErr bool
}{
"if no attributes given, return 2/3rd certificate lifetime": {
expTime: notBefore.AddDate(0, 0, 2),
expErr: false,
expTime: notAfter,
renewTime: notBefore.AddDate(0, 0, 2),
expErr: false,
},
}

for name, test := range tests {
t.Run(name, func(t *testing.T) {
renewTime, err := calculateNextIssuanceTime(certPEM)
expTime, renewTime, err := getExpiryAndDefaultNextIssuanceTime(certPEM)
assert.Equal(t, test.expErr, err != nil)
assert.Equal(t, test.expTime, renewTime)
assert.Equal(t, test.expTime, expTime)
assert.Equal(t, test.renewTime, renewTime)
})
}
}
Expand Down
102 changes: 102 additions & 0 deletions metrics/certificaterequest.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
Copyright 2024 The cert-manager Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"time"

"github.com/prometheus/client_golang/prometheus"

cmapi "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1"
cmmeta "github.com/cert-manager/cert-manager/pkg/apis/meta/v1"
)

// readyConditionStatuses lists the condition status values (True, False and
// Unknown) that updateCertificateRequestReadyStatus iterates over when setting
// the ready-status metric, so that one series is written per status value.
var readyConditionStatuses = [...]cmmeta.ConditionStatus{
cmmeta.ConditionTrue,
cmmeta.ConditionFalse,
cmmeta.ConditionUnknown,
}

// UpdateCertificateRequest updates the metrics of the given CertificateRequest:
// its expiry and renewal timestamps (taken from exp and renewal) and its Ready
// condition status (taken from cr's status conditions).
func (m *Metrics) UpdateCertificateRequest(cr *cmapi.CertificateRequest, exp, renewal time.Time) {
m.updateCertificateRequestExpiryAndRenewalTime(cr, exp, renewal)
m.updateCertificateRequestStatus(cr)
}

// updateCertificateRequestExpiryAndRenewalTime sets the expiry and renewal
// timestamp metrics of the given CertificateRequest. Both values are exported
// as Unix seconds; a zero time.Time is exported as 0.
func (m *Metrics) updateCertificateRequestExpiryAndRenewalTime(cr *cmapi.CertificateRequest, exp, renewal time.Time) {
	// unixOrZero converts t to Unix seconds, mapping the zero time to 0.
	unixOrZero := func(t time.Time) float64 {
		if t.IsZero() {
			return 0.0
		}
		return float64(t.Unix())
	}

	// Both metrics are keyed by the same label set, so build it only once.
	issuer := cr.Spec.IssuerRef
	labels := prometheus.Labels{
		"name":         cr.Name,
		"namespace":    cr.Namespace,
		"issuer_name":  issuer.Name,
		"issuer_kind":  issuer.Kind,
		"issuer_group": issuer.Group,
	}

	m.certificateRequestExpiryTimeSeconds.With(labels).Set(unixOrZero(exp))
	m.certificateRequestRenewalTimeSeconds.With(labels).Set(unixOrZero(renewal))
}

// updateCertificateRequestStatus updates the ready-status metric of the given
// CertificateRequest from the first Ready condition found in its status.
func (m *Metrics) updateCertificateRequestStatus(cr *cmapi.CertificateRequest) {
	// Until a Ready condition has been set, the status is reported as Unknown.
	ready := cmmeta.ConditionUnknown

	for i := range cr.Status.Conditions {
		cond := &cr.Status.Conditions[i]
		if cond.Type == cmapi.CertificateRequestConditionReady {
			ready = cond.Status
			break
		}
	}

	m.updateCertificateRequestReadyStatus(cr, ready)
}

// updateCertificateRequestReadyStatus writes one ready-status series per entry
// in readyConditionStatuses for the given CertificateRequest: the series whose
// "condition" label equals current is set to 1, all the others are set to 0.
func (m *Metrics) updateCertificateRequestReadyStatus(cr *cmapi.CertificateRequest, current cmmeta.ConditionStatus) {
	issuer := cr.Spec.IssuerRef

	for i := range readyConditionStatuses {
		status := readyConditionStatuses[i]

		series := m.certificateRequestReadyStatus.With(prometheus.Labels{
			"name":         cr.Name,
			"namespace":    cr.Namespace,
			"condition":    string(status),
			"issuer_name":  issuer.Name,
			"issuer_kind":  issuer.Kind,
			"issuer_group": issuer.Group,
		})

		if status == current {
			series.Set(1.0)
		} else {
			series.Set(0.0)
		}
	}
}

// RemoveCertificateRequest stops exposing the metrics of the CertificateRequest
// with the given name and namespace by deleting every expiry, renewal and
// ready-status series carrying those two label values.
func (m *Metrics) RemoveCertificateRequest(name, namespace string) {
	// Only name and namespace are matched; the remaining labels may hold any value.
	match := prometheus.Labels{
		"name":      name,
		"namespace": namespace,
	}

	m.certificateRequestExpiryTimeSeconds.DeletePartialMatch(match)
	m.certificateRequestRenewalTimeSeconds.DeletePartialMatch(match)
	m.certificateRequestReadyStatus.DeletePartialMatch(match)
}
Loading

0 comments on commit c381e55

Please sign in to comment.