From b6871aa44368b3f5d54dcae474f6e062be690b25 Mon Sep 17 00:00:00 2001 From: Achilleas Triantafyllou Date: Sun, 24 Jul 2022 17:59:05 +0300 Subject: [PATCH] Introduce requests per LB pool metric & fix LB pool health status metric (#65) * fix: Report LB pool health properly The `cloudflare_zone_pool_health_status` metric was introduced with the goal of reporting the actual health status of each pool over time, regardless of whether the pool is the selected one or not. The initial implementation uses the `loadBalancingRequestsAdaptiveGroups` type, which returns aggregated Load Balancing origin requests with adaptive sampling and contains information only about the selectedPool (Name, Health, AvgRttMs, etc.), rather than about all LB pools. This leads to reporting back the status of only the selectedPool (which normally is always 1) and not the overall status of all of the LB's pools. According to Cloudflare's documentation about the Load Balancing GraphQL Analytics API[1], the `loadBalancingRequestsAdaptive` schema can be used to fetch analytics about the raw Load Balancing origin requests with adaptive sampling. These analytics expose information about the selected pool, like: * Selected Pool - Name/Health/ID/Average RTT * Session Affinity - Type/Status But they also expose information about the LB's pools and origins regardless of the selection decision, like: * Pools ID/Name/Health/Average RTT/Selected * Origins Name/FQDN/IPv4/IPv6/Selected The latter metrics can be used to report the health of all pools associated with the account's LBs. For this reason, this commit adds the required `loadBalancingRequestsAdaptive` schema struct to the lbResp struct and configures it as the source for the `cloudflare_zone_pool_health_status` metric. 1. https://developers.cloudflare.com/load-balancing/reference/load-balancing-analytics/#graphql-analytics * feat: Add poolRequestsTotal metric This commit introduces a Prometheus counter metric for the total number of requests per pool. 
--- README.md | 3 ++- cloudflare.go | 68 +++++++++++++++++++++++++++++++++++++++++++++------ prometheus.go | 42 ++++++++++++++++++++++--------- 3 files changed, 92 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index acea378..4bd4f13 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,8 @@ Note: `ZONE_` configuration is not supported as flag. # HELP cloudflare_zone_threats_country Threats per zone per country # HELP cloudflare_zone_threats_total Threats per zone # HELP cloudflare_zone_uniques_total Uniques per zone -# HELP cloudflare_zone_pool_health_status Reports the health of a pool, 1 for healthy, 0 for unhealthy. +# HELP cloudflare_zone_pool_health_status Reports the health of a pool, 1 for healthy, 0 for unhealthy +# HELP cloudflare_zone_pool_requests_total Requests per pool ``` ## Helm chart repository diff --git a/cloudflare.go b/cloudflare.go index 79df6bc..abd5aa6 100644 --- a/cloudflare.go +++ b/cloudflare.go @@ -183,15 +183,41 @@ type lbResp struct { LoadBalancingRequestsAdaptiveGroups []struct { Count uint64 `json:"count"` Dimensions struct { - ColoCode string `json:"coloCode"` - LbName string `json:"lbName"` - Region string `json:"region"` - SelectedOriginName string `json:"selectedOriginName"` - SelectedPoolHealthy int `json:"selectedPoolHealthy"` - SelectedPoolName string `json:"selectedPoolName"` - SteeringPolicy string `json:"steeringPolicy"` + LbName string `json:"lbName"` + Proxied uint8 `json:"proxied"` + Region string `json:"region"` + SelectedOriginName string `json:"selectedOriginName"` + SelectedPoolAvgRttMs uint64 `json:"selectedPoolAvgRttMs"` + SelectedPoolHealthy uint8 `json:"selectedPoolHealthy"` + SelectedPoolName string `json:"selectedPoolName"` + SteeringPolicy string `json:"steeringPolicy"` } `json:"dimensions"` } `json:"loadBalancingRequestsAdaptiveGroups"` + + LoadBalancingRequestsAdaptive []struct { + LbName string `json:"lbName"` + Proxied uint8 `json:"proxied"` + Region string `json:"region"` + 
SelectedPoolHealthy uint8 `json:"selectedPoolHealthy"` + SelectedPoolID string `json:"selectedPoolID"` + SelectedPoolName string `json:"selectedPoolName"` + SessionAffinityStatus string `json:"sessionAffinityStatus"` + SteeringPolicy string `json:"steeringPolicy"` + SelectedPoolAvgRttMs uint64 `json:"selectedPoolAvgRttMs"` + Pools []struct { + AvgRttMs uint64 `json:"avgRttMs"` + Healthy uint8 `json:"healthy"` + ID string `json:"id"` + PoolName string `json:"poolName"` + } `json:"pools"` + Origins []struct { + OriginName string `json:"originName"` + Health uint8 `json:"health"` + IPv4 string `json:"ipv4"` + Selected uint8 `json:"selected"` + } `json:"origins"` + } `json:"loadBalancingRequestsAdaptive"` + ZoneTag string `json:"zoneTag"` } @@ -493,15 +519,41 @@ func fetchLoadBalancerTotals(zoneIDs []string) (*cloudflareResponseLb, error) { limit: $limit) { count dimensions { - coloCode region lbName selectedPoolName + proxied selectedOriginName + selectedPoolAvgRttMs selectedPoolHealthy steeringPolicy } } + loadBalancingRequestsAdaptive( + filter: { datetime_geq: $mintime, datetime_lt: $maxtime}, + limit: $limit) { + lbName + proxied + region + selectedPoolHealthy + selectedPoolId + selectedPoolName + sessionAffinityStatus + steeringPolicy + selectedPoolAvgRttMs + pools { + id + poolName + healthy + avgRttMs + } + origins { + originName + health + ipv4 + selected + } + } } } } diff --git a/prometheus.go b/prometheus.go index 93eeb01..02a2bf3 100644 --- a/prometheus.go +++ b/prometheus.go @@ -174,12 +174,18 @@ var ( }, []string{"script_name", "quantile"}, ) - poolHealthStatus = promauto.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "cloudflare_zone_pool_health_status", - Help: "Reports the health of a pool, 1 for healthy, 0 for unhealthy.", - }, - []string{"zone", "colo_code", "load_balancer_name", "origin_name", "steering_policy", "pool_name", "region"}, + poolHealthStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cloudflare_zone_pool_health_status", + 
Help: "Reports the health of a pool, 1 for healthy, 0 for unhealthy.", + }, + []string{"zone", "load_balancer_name", "pool_name"}, + ) + + poolRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "cloudflare_zone_pool_requests_total", + Help: "Requests per pool", + }, + []string{"zone", "load_balancer_name", "pool_name", "origin_name"}, ) ) @@ -384,6 +390,7 @@ func fetchLoadBalancerAnalytics(zones []cloudflare.Zone, wg *sync.WaitGroup) { } for _, lb := range l.Viewer.Zones { name := findZoneName(zones, lb.ZoneTag) + addLoadBalancingRequestsAdaptive(&lb, name) addLoadBalancingRequestsAdaptiveGroups(&lb, name) } } @@ -391,16 +398,27 @@ func fetchLoadBalancerAnalytics(zones []cloudflare.Zone, wg *sync.WaitGroup) { func addLoadBalancingRequestsAdaptiveGroups(z *lbResp, name string) { for _, g := range z.LoadBalancingRequestsAdaptiveGroups { - poolHealthStatus.With( + poolRequestsTotal.With( prometheus.Labels{ "zone": name, - "colo_code": g.Dimensions.ColoCode, "load_balancer_name": g.Dimensions.LbName, - "origin_name": g.Dimensions.SelectedOriginName, - "steering_policy": g.Dimensions.SteeringPolicy, "pool_name": g.Dimensions.SelectedPoolName, - "region": g.Dimensions.Region, - }).Set(float64(g.Dimensions.SelectedPoolHealthy)) + "origin_name": g.Dimensions.SelectedOriginName, + }).Add(float64(g.Count)) + } +} + +func addLoadBalancingRequestsAdaptive(z *lbResp, name string) { + + for _, g := range z.LoadBalancingRequestsAdaptive { + for _, p := range g.Pools { + poolHealthStatus.With( + prometheus.Labels{ + "zone": name, + "load_balancer_name": g.LbName, + "pool_name": p.PoolName, + }).Set(float64(p.Healthy)) + } } }