metrics.go
// metrics.go
// Copyright (C) 2021 rmelo <Ricardo Melo <[email protected]>>
//
// Distributed under terms of the MIT license.

package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"go.uber.org/zap"
)

// Metrics represents all Prometheus metrics for the application
type Metrics struct {
	// Operation metrics
	operationErrors   *prometheus.CounterVec
	operationDuration *prometheus.HistogramVec

	// Helm metrics
	chartVersions *prometheus.GaugeVec
	chartOverdue  *prometheus.GaugeVec

	// Repository metrics
	repoErrors   *prometheus.CounterVec
	lastRepoSync *prometheus.GaugeVec

	// Server metrics
	serverUptime prometheus.Gauge
}

// Global metrics instance
var metrics *Metrics

// newMetrics creates and registers all metrics
func newMetrics() *Metrics {
	return &Metrics{
		operationErrors: promauto.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: "helm_monitor",
				Name:      "operation_errors_total",
				Help:      "Total number of failed operations",
			},
			[]string{"operation", "error_type"},
		),
		operationDuration: promauto.NewHistogramVec(
			prometheus.HistogramOpts{
				Namespace: "helm_monitor",
				Name:      "operation_duration_seconds",
				Help:      "Duration of operations in seconds",
				Buckets:   []float64{.1, .25, .5, 1, 2.5, 5, 10, 30},
			},
			[]string{"operation"},
		),
		chartVersions: promauto.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "helm_monitor",
				Name:      "chart_versions_available",
				Help:      "Number of available versions for each chart",
			},
			[]string{"chart", "namespace"},
		),
		chartOverdue: promauto.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "helm_monitor",
				Name:      "chart_versions_overdue",
				Help:      "Number of versions a chart is behind latest",
			},
			[]string{"chart", "namespace", "current_version"},
		),
		repoErrors: promauto.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: "helm_monitor",
				Name:      "repo_errors_total",
				Help:      "Total number of repository operation errors",
			},
			[]string{"repo", "operation"},
		),
		lastRepoSync: promauto.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "helm_monitor",
				Name:      "repo_last_sync_timestamp",
				Help:      "Timestamp of last successful repository sync",
			},
			[]string{"repo"},
		),
		serverUptime: promauto.NewGauge(
			prometheus.GaugeOpts{
				Namespace: "helm_monitor",
				Name:      "server_uptime_seconds",
				Help:      "Time since the server started in seconds",
			},
		),
	}
}

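// With the "helm_monitor" namespace set above, the collectors end up exposed
// under fully qualified names such as (label values elided):
//
//	helm_monitor_operation_errors_total{operation, error_type}
//	helm_monitor_operation_duration_seconds_bucket{operation, le} (plus _sum and _count)
//	helm_monitor_chart_versions_available{chart, namespace}
//	helm_monitor_chart_versions_overdue{chart, namespace, current_version}
//	helm_monitor_repo_errors_total{repo, operation}
//	helm_monitor_repo_last_sync_timestamp{repo}
//	helm_monitor_server_uptime_seconds
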
// init constructs the global metrics instance at startup; promauto registers
// each collector with the Prometheus default registry as it is created.
func init() {
	metrics = newMetrics()
}

// Helper functions for recording metrics

func recordOperationError(operation, errorType string) {
	metrics.operationErrors.WithLabelValues(operation, errorType).Inc()
}

func recordOperationDuration(operation string, duration float64) {
	metrics.operationDuration.WithLabelValues(operation).Observe(duration)
}

func recordChartVersions(chart, namespace string, count float64) {
	metrics.chartVersions.WithLabelValues(chart, namespace).Set(count)
}

func recordChartOverdue(chart, namespace, currentVersion string, overdueCount float64) {
	metrics.chartOverdue.WithLabelValues(chart, namespace, currentVersion).Set(overdueCount)
}

func recordRepoError(repo, operation string) {
	metrics.repoErrors.WithLabelValues(repo, operation).Inc()
}

func recordRepoSync(repo string, timestamp float64) {
	metrics.lastRepoSync.WithLabelValues(repo).Set(timestamp)
}

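// timeOperation is an illustrative sketch (not part of the original file) of how
// the helpers above could be combined: it runs fn, always records its duration,
// and increments the error counter with the supplied error type if fn fails.
func timeOperation(operation, errorType string, fn func() error) error {
	start := time.Now()
	err := fn()
	recordOperationDuration(operation, time.Since(start).Seconds())
	if err != nil {
		recordOperationError(operation, errorType)
	}
	return err
}
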
// updateUptimeMetric updates the server uptime metric
func updateUptimeMetric(startTime time.Time) {
	metrics.serverUptime.Set(time.Since(startTime).Seconds())
}

// recordMetrics periodically updates dynamic metrics
func recordMetrics(ctx context.Context, logger *zap.Logger, startTime time.Time) {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			updateUptimeMetric(startTime)

			// Update chart metrics
			if len(helmMetrics) != 0 {
				for _, metric := range helmMetrics {
					for chartName, chartData := range metric {
						recordChartOverdue(
							chartName,
							chartData.Namespace,
							chartData.ChartVersion,
							chartData.Overdue,
						)
					}
				}
			}
		}
	}
}

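// Note: helmMetrics is assumed to be a package-level collection maintained
// elsewhere in this repository; the loop above only relies on each element
// mapping a chart name to data carrying Namespace, ChartVersion, and Overdue
// fields, so its concrete type is not restated here.
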
// exposeMetric starts the metrics server and handles graceful shutdown
func exposeMetric(ctx context.Context, logger *zap.Logger) error {
	startTime := time.Now()

	// Wait (up to two minutes) for the first chart metrics before serving
	timeout := time.After(time.Minute * 2)
	for {
		if len(helmMetrics) != 0 {
			break
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-timeout:
			return fmt.Errorf("timeout waiting for initial metrics")
		default:
			logger.Debug("waiting for initial metrics...")
			time.Sleep(2 * time.Second)
		}
	}

	// Start recording metrics
	go recordMetrics(ctx, logger, startTime)

	// Configure metrics server
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())

	// Health check endpoint: returns 503 until chart metrics are available
	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		if len(helmMetrics) == 0 {
			w.WriteHeader(http.StatusServiceUnavailable)
			return
		}
		w.WriteHeader(http.StatusOK)
	})

	server := &http.Server{
		Addr:    ":2112",
		Handler: mux,
	}

	// Channel to capture server errors
	serverError := make(chan error, 1)
	go func() {
		logger.Info("starting metrics server",
			zap.String("address", "http://0.0.0.0:2112/metrics"))
		if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			serverError <- fmt.Errorf("metrics server error: %w", err)
		}
	}()

	// Wait for context cancellation or server error
	select {
	case err := <-serverError:
		return err
	case <-ctx.Done():
		// Graceful shutdown
		shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()

		logger.Info("shutting down metrics server")
		if err := server.Shutdown(shutdownCtx); err != nil {
			return fmt.Errorf("error shutting down metrics server: %w", err)
		}
		return ctx.Err()
	}
}
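
// runMetricsExample is an illustrative sketch (not part of the original file) of
// how a caller such as main might wire exposeMetric to a cancellable context so
// the graceful-shutdown path above is exercised on termination.
func runMetricsExample() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	logger, err := zap.NewProduction()
	if err != nil {
		panic(err)
	}
	defer logger.Sync()

	if err := exposeMetric(ctx, logger); err != nil {
		logger.Error("metrics server exited", zap.Error(err))
	}
}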