Skip to content

Commit

Permalink
fix(gke): Update collect method to gracefully fail (#336)
Browse files Browse the repository at this point in the history
There is a condition where a failure to find a machine type in the price map was causing the `Collect` method to return instead of continuing. Instead, this changes the logic to log out the message within `Collect` and continue operating.
Adds a unit test to reproduce the error and then validates it is resolved.

A future improvement here would be to
1. Export a metric
2. Set a sane default value(if provided?)

- closes #335
  • Loading branch information
Pokom authored Oct 22, 2024
1 parent 8716868 commit f52c86f
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 2 deletions.
28 changes: 26 additions & 2 deletions pkg/google/gke/gke.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,13 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
clusterName := instance.GetClusterName()
// We skip instances that do not have a clusterName because they are not associated with an GKE cluster
if clusterName == "" {
c.logger.LogAttrs(ctx,
slog.LevelDebug,
"instance does not have a clustername",
slog.String("region", instance.Region),
slog.String("machine_type", instance.MachineType),
slog.String("project", project),
)
continue
}
labelValues := []string{
Expand All @@ -158,7 +165,16 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
}
cpuCost, ramCost, err := c.PricingMap.GetCostOfInstance(instance)
if err != nil {
return err
// Log out the error and continue processing nodes
// TODO(@pokom): Should we set sane defaults here to emit _something_?
c.logger.LogAttrs(ctx,
slog.LevelError,
err.Error(),
slog.String("machine_type", instance.MachineType),
slog.String("region", instance.Region),
slog.String("project", project),
)
continue
}
ch <- prometheus.MustNewConstMetric(
gkeNodeCPUHourlyCostDesc,
Expand Down Expand Up @@ -198,7 +214,15 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) error {

price, err := c.PricingMap.GetCostOfStorage(d.Region(), d.StorageClass())
if err != nil {
fmt.Printf("%s error getting cost of storage: %v\n", disk.Name, err)
c.logger.LogAttrs(ctx,
slog.LevelError,
err.Error(),
slog.String("disk_name", disk.Name),
slog.String("project", project),
slog.String("region", d.Region()),
slog.String("cluster_name", d.Cluster),
slog.String("storage_class", d.StorageClass()),
)
continue
}
ch <- prometheus.MustNewConstMetric(
Expand Down
13 changes: 13 additions & 0 deletions pkg/google/gke/gke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,19 @@ func TestCollector_Collect(t *testing.T) {
GkeClusterLabel: "test",
},
},
{
// Add in an instance that does not have a machine type that would exist in the pricing map.
// This test replicates and fixes https://github.com/grafana/cloudcost-exporter/issues/335
Name: "test-n1-spot",
MachineType: "abc/n8-slim",
Zone: "testing/us-central1-a",
Scheduling: &computev1.Scheduling{
ProvisioningModel: "SPOT",
},
Labels: map[string]string{
GkeClusterLabel: "test",
},
},
{
Name: "test-n2-us-east1",
MachineType: "abc/n2-slim",
Expand Down

0 comments on commit f52c86f

Please sign in to comment.