From d2b4d45eeeed9bdd1b8486d3e686ae348e7af504 Mon Sep 17 00:00:00 2001
From: Em Sharnoff
Date: Fri, 11 Oct 2024 10:42:11 -0700
Subject: [PATCH] agent: Add scaling event reporting

This is part 2 of 2; see #1078 for the groundwork. In short, this commit:

* Adds a new package: 'pkg/agent/scalingevents'
* Adds new callbacks to core.State to allow it to report on scaling events and
  changes in desired CU.
---
 autoscaler-agent/config_map.yaml | 7 ++
 pkg/agent/config.go | 17 ++-
 pkg/agent/core/goalcu.go | 45 +++++---
 pkg/agent/core/state.go | 28 +++++
 pkg/agent/core/state_test.go | 6 +
 pkg/agent/core/testhelpers/construct.go | 2 +
 pkg/agent/entrypoint.go | 9 +-
 pkg/agent/globalstate.go | 6 +
 pkg/agent/runner.go | 107 ++++++++++++++++++
 pkg/agent/scalingevents/clients.go | 64 +++++++++++
 pkg/agent/scalingevents/prommetrics.go | 29 +++++
 pkg/agent/scalingevents/reporter.go | 142 ++++++++++++++++++++++++
 pkg/api/types.go | 17 +++
 pkg/api/vminfo.go | 20 ++++
 14 files changed, 480 insertions(+), 19 deletions(-)
 create mode 100644 pkg/agent/scalingevents/clients.go
 create mode 100644 pkg/agent/scalingevents/prommetrics.go
 create mode 100644 pkg/agent/scalingevents/reporter.go

diff --git a/autoscaler-agent/config_map.yaml b/autoscaler-agent/config_map.yaml
index b22c30d6b..727004521 100644
--- a/autoscaler-agent/config_map.yaml
+++ b/autoscaler-agent/config_map.yaml
@@ -26,6 +26,13 @@ data:
 "accumulateEverySeconds": 24,
 "clients": {}
 },
+ "scalingEvents": {
+ "cuMultiplier": 0.25,
+ "rereportThreshold": 0.25,
+ "clusterName": "replaceme",
+ "regionName": "replaceme",
+ "clients": {}
+ },
 "monitor": {
 "serverPort": 10301,
 "responseTimeoutSeconds": 5,
diff --git a/pkg/agent/config.go b/pkg/agent/config.go
index 254e7f6ae..93d219225 100644
--- a/pkg/agent/config.go
+++ b/pkg/agent/config.go
@@ -8,6 +8,7 @@ import (
 "github.com/tychoish/fun/erc"
 "github.com/neondatabase/autoscaling/pkg/agent/billing"
+ "github.com/neondatabase/autoscaling/pkg/agent/scalingevents"
 "github.com/neondatabase/autoscaling/pkg/api"
 "github.com/neondatabase/autoscaling/pkg/reporting"
 )
@@ -15,12 +16,14 @@ import (
 type Config struct {
 RefreshStateIntervalSeconds uint `json:"refereshStateIntervalSeconds"`
+ Billing billing.Config `json:"billing"`
+ ScalingEvents scalingevents.Config `json:"scalingEvents"`
+
 Scaling ScalingConfig `json:"scaling"`
 Metrics MetricsConfig `json:"metrics"`
 Scheduler SchedulerConfig `json:"scheduler"`
 Monitor MonitorConfig `json:"monitor"`
 NeonVM NeonVMConfig `json:"neonvm"`
- Billing billing.Config `json:"billing"`
 DumpState *DumpStateConfig `json:"dumpState"`
 }
@@ -193,6 +196,18 @@ func (c *Config) validate() error {
 erc.Whenf(ec, c.Billing.Clients.S3.Region == "", emptyTmpl, ".billing.clients.s3.region")
 erc.Whenf(ec, c.Billing.Clients.S3.PrefixInBucket == "", emptyTmpl, ".billing.clients.s3.prefixInBucket")
 }
+
+ erc.Whenf(ec, c.ScalingEvents.CUMultiplier == 0, zeroTmpl, ".scalingEvents.cuMultiplier")
+ erc.Whenf(ec, c.ScalingEvents.RereportThreshold == 0, zeroTmpl, ".scalingEvents.rereportThreshold")
+ erc.Whenf(ec, c.ScalingEvents.ClusterName == "", emptyTmpl, ".scalingEvents.clusterName")
+ erc.Whenf(ec, c.ScalingEvents.RegionName == "", emptyTmpl, ".scalingEvents.regionName")
+ if c.ScalingEvents.Clients.S3 != nil {
+ validateBaseReportingConfig(&c.ScalingEvents.Clients.S3.BaseClientConfig, "scalingEvents.clients.s3")
+ erc.Whenf(ec, c.ScalingEvents.Clients.S3.Bucket == "", emptyTmpl, ".scalingEvents.clients.s3.bucket")
+ erc.Whenf(ec, c.ScalingEvents.Clients.S3.Region == "", emptyTmpl, 
".scalingEvents.clients.s3.region") + erc.Whenf(ec, c.ScalingEvents.Clients.S3.PrefixInBucket == "", emptyTmpl, ".scalingEvents.clients.s3.prefixInBucket") + } + erc.Whenf(ec, c.DumpState != nil && c.DumpState.Port == 0, zeroTmpl, ".dumpState.port") erc.Whenf(ec, c.DumpState != nil && c.DumpState.TimeoutSeconds == 0, zeroTmpl, ".dumpState.timeoutSeconds") diff --git a/pkg/agent/core/goalcu.go b/pkg/agent/core/goalcu.go index fd5785d79..8c733a66c 100644 --- a/pkg/agent/core/goalcu.go +++ b/pkg/agent/core/goalcu.go @@ -9,6 +9,7 @@ import ( "go.uber.org/zap" "go.uber.org/zap/zapcore" + "github.com/neondatabase/autoscaling/pkg/agent/scalingevents" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -19,6 +20,7 @@ type scalingGoal struct { func calculateGoalCU( warn func(string), + report func(goalCU uint32, parts scalingevents.GoalCUComponents), cfg api.ScalingConfig, computeUnit api.Resources, systemMetrics *SystemMetrics, @@ -29,14 +31,16 @@ func calculateGoalCU( warn("Making scaling decision without all required metrics available") } - var lfcGoalCU, cpuGoalCU, memGoalCU, memTotalGoalCU uint32 + var lfcGoalCU, cpuGoalCU, memGoalCU, memTotalGoalCU float64 var logFields []zap.Field + var reportedGoals scalingevents.GoalCUComponents var wss *api.Bytes // estimated working set size if lfcMetrics != nil { var lfcLogFunc func(zapcore.ObjectEncoder) error lfcGoalCU, wss, lfcLogFunc = calculateLFCGoalCU(warn, cfg, computeUnit, *lfcMetrics) + reportedGoals.LFC = lo.ToPtr(lfcGoalCU) if lfcLogFunc != nil { logFields = append(logFields, zap.Object("lfc", zapcore.ObjectMarshalerFunc(lfcLogFunc))) } @@ -44,15 +48,27 @@ func calculateGoalCU( if systemMetrics != nil { cpuGoalCU = calculateCPUGoalCU(cfg, computeUnit, *systemMetrics) + reportedGoals.CPU = lo.ToPtr(cpuGoalCU) memGoalCU = calculateMemGoalCU(cfg, computeUnit, *systemMetrics) + reportedGoals.Mem = lo.ToPtr(memGoalCU) } if systemMetrics != nil && wss != nil { memTotalGoalCU = calculateMemTotalGoalCU(cfg, computeUnit, *systemMetrics, *wss) + reportedGoals.Mem = lo.ToPtr(max(*reportedGoals.Mem, memTotalGoalCU)) } - goalCU := max(cpuGoalCU, memGoalCU, memTotalGoalCU, lfcGoalCU) + goalCU := uint32(math.Ceil(max( + math.Round(cpuGoalCU), // for historical compatibility, use round() instead of ceil() + memGoalCU, + memTotalGoalCU, + lfcGoalCU, + ))) + if hasAllMetrics { + // Report this information, for scaling metrics. + report(goalCU, reportedGoals) + } return scalingGoal{hasAllMetrics: hasAllMetrics, goalCU: goalCU}, logFields } @@ -64,10 +80,9 @@ func calculateCPUGoalCU( cfg api.ScalingConfig, computeUnit api.Resources, systemMetrics SystemMetrics, -) uint32 { +) float64 { goalCPUs := systemMetrics.LoadAverage1Min / *cfg.LoadAverageFractionTarget - cpuGoalCU := uint32(math.Round(goalCPUs / computeUnit.VCPU.AsFloat64())) - return cpuGoalCU + return goalCPUs / computeUnit.VCPU.AsFloat64() } // For Mem: @@ -78,13 +93,11 @@ func calculateMemGoalCU( cfg api.ScalingConfig, computeUnit api.Resources, systemMetrics SystemMetrics, -) uint32 { +) float64 { // goal memory size, just looking at allocated memory (not including page cache...) 
- memGoalBytes := api.Bytes(math.Round(systemMetrics.MemoryUsageBytes / *cfg.MemoryUsageFractionTarget)) + memGoalBytes := math.Round(systemMetrics.MemoryUsageBytes / *cfg.MemoryUsageFractionTarget) - // note: this is equal to ceil(memGoalBytes / computeUnit.Mem), because ceil(X/M) == floor((X+M-1)/M) - memGoalCU := uint32((memGoalBytes + computeUnit.Mem - 1) / computeUnit.Mem) - return memGoalCU + return memGoalBytes / float64(computeUnit.Mem) } // goal memory size, looking at allocated memory and min(page cache usage, LFC working set size) @@ -93,12 +106,11 @@ func calculateMemTotalGoalCU( computeUnit api.Resources, systemMetrics SystemMetrics, wss api.Bytes, -) uint32 { +) float64 { lfcCached := min(float64(wss), systemMetrics.MemoryCachedBytes) - totalGoalBytes := api.Bytes((lfcCached + systemMetrics.MemoryUsageBytes) / *cfg.MemoryTotalFractionTarget) + totalGoalBytes := (lfcCached + systemMetrics.MemoryUsageBytes) / *cfg.MemoryTotalFractionTarget - memTotalGoalCU := uint32((totalGoalBytes + computeUnit.Mem - 1) / computeUnit.Mem) - return memTotalGoalCU + return totalGoalBytes / float64(computeUnit.Mem) } func calculateLFCGoalCU( @@ -106,7 +118,7 @@ func calculateLFCGoalCU( cfg api.ScalingConfig, computeUnit api.Resources, lfcMetrics LFCMetrics, -) (uint32, *api.Bytes, func(zapcore.ObjectEncoder) error) { +) (float64, *api.Bytes, func(zapcore.ObjectEncoder) error) { wssValues := lfcMetrics.ApproximateworkingSetSizeBuckets // At this point, we can assume that the values are equally spaced at 1 minute apart, // starting at 1 minute. @@ -135,7 +147,6 @@ func calculateLFCGoalCU( requiredMem := estimateWssMem / *cfg.LFCToMemoryRatio // ... and then convert that into the actual CU required to fit the working set: requiredCU := requiredMem / computeUnit.Mem.AsFloat64() - lfcGoalCU := uint32(math.Ceil(requiredCU)) lfcLogFields := func(obj zapcore.ObjectEncoder) error { obj.AddFloat64("estimateWssPages", estimateWss) @@ -144,6 +155,6 @@ func calculateLFCGoalCU( return nil } - return lfcGoalCU, lo.ToPtr(api.Bytes(estimateWssMem)), lfcLogFields + return requiredCU, lo.ToPtr(api.Bytes(estimateWssMem)), lfcLogFields } } diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index b7db040dd..114923694 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -31,6 +31,7 @@ import ( vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/core/revsource" + "github.com/neondatabase/autoscaling/pkg/agent/scalingevents" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -38,8 +39,14 @@ type ObservabilityCallbacks struct { PluginLatency revsource.ObserveCallback MonitorLatency revsource.ObserveCallback NeonVMLatency revsource.ObserveCallback + + ScalingEvent ReportScalingEventCallback + DesiredScaling ReportDesiredScalingCallback } +type ReportScalingEventCallback func(timestamp time.Time, current uint32, target uint32) +type ReportDesiredScalingCallback func(timestamp time.Time, current uint32, target uint32, parts scalingevents.GoalCUComponents) + type RevisionSource interface { Next(ts time.Time, flags vmv1.Flag) vmv1.Revision Observe(moment time.Time, rev vmv1.Revision) error @@ -727,8 +734,20 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( // 2. Cap the goal CU by min/max, etc // 3. that's it! 
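+ // reportGoals forwards the desired scaling decision to the DesiredScaling
+ // observability callback (if one is configured), translating the VM's current
+ // allocation into whole compute units via DivResources. If the current
+ // allocation is not an integer multiple of the compute unit, nothing is reported.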
+ reportGoals := func(goalCU uint32, parts scalingevents.GoalCUComponents) { + currentCU, ok := s.VM.Using().DivResources(s.Config.ComputeUnit) + if !ok { + return // skip reporting if the current CU is not right. + } + + if report := s.Config.ObservabilityCallbacks.DesiredScaling; report != nil { + report(now, uint32(currentCU), goalCU, parts) + } + } + sg, goalCULogFields := calculateGoalCU( s.warn, + reportGoals, s.scalingConfig(), s.Config.ComputeUnit, s.Metrics, @@ -1220,6 +1239,15 @@ func (s *State) NeonVM() NeonVMHandle { } func (h NeonVMHandle) StartingRequest(now time.Time, resources api.Resources) { + if report := h.s.Config.ObservabilityCallbacks.ScalingEvent; report != nil { + currentCU, currentOk := h.s.VM.Using().DivResources(h.s.Config.ComputeUnit) + targetCU, targetOk := resources.DivResources(h.s.Config.ComputeUnit) + + if currentOk && targetOk { + report(now, uint32(currentCU), uint32(targetCU)) + } + } + // FIXME: add time to ongoing request info (or maybe only in RequestFailed?) h.s.NeonVM.OngoingRequested = &resources } diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index d975de870..1e5b6cd06 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -222,6 +222,8 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { AlwaysMigrate: false, ScalingEnabled: true, ScalingConfig: nil, + ReportScalingEvents: false, + ReportDesiredScaling: false, }, CurrentRevision: nil, } @@ -257,6 +259,8 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { PluginLatency: nil, MonitorLatency: nil, NeonVMLatency: nil, + ScalingEvent: nil, + DesiredScaling: nil, }, } } @@ -342,6 +346,8 @@ var DefaultInitialStateConfig = helpers.InitialStateConfig{ PluginLatency: nil, MonitorLatency: nil, NeonVMLatency: nil, + ScalingEvent: nil, + DesiredScaling: nil, }, }, } diff --git a/pkg/agent/core/testhelpers/construct.go b/pkg/agent/core/testhelpers/construct.go index a97e8a10b..95a2a4b66 100644 --- a/pkg/agent/core/testhelpers/construct.go +++ b/pkg/agent/core/testhelpers/construct.go @@ -85,6 +85,8 @@ func CreateVmInfo(config InitialVmInfoConfig, opts ...VmInfoOpt) api.VmInfo { AlwaysMigrate: false, ScalingConfig: nil, ScalingEnabled: true, + ReportScalingEvents: false, + ReportDesiredScaling: false, }, CurrentRevision: nil, } diff --git a/pkg/agent/entrypoint.go b/pkg/agent/entrypoint.go index 38e07cec2..ae315161f 100644 --- a/pkg/agent/entrypoint.go +++ b/pkg/agent/entrypoint.go @@ -11,6 +11,7 @@ import ( vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned" "github.com/neondatabase/autoscaling/pkg/agent/billing" + "github.com/neondatabase/autoscaling/pkg/agent/scalingevents" "github.com/neondatabase/autoscaling/pkg/agent/schedwatch" "github.com/neondatabase/autoscaling/pkg/util" "github.com/neondatabase/autoscaling/pkg/util/taskgroup" @@ -51,7 +52,13 @@ func (r MainRunner) Run(logger *zap.Logger, ctx context.Context) error { } defer schedTracker.Stop() - globalState, globalPromReg := r.newAgentState(logger, r.EnvArgs.K8sPodIP, schedTracker) + scalingEventsMetrics := scalingevents.NewPromMetrics() + scalingReporter, err := scalingevents.NewReporter(ctx, logger, &r.Config.ScalingEvents, scalingEventsMetrics) + if err != nil { + return fmt.Errorf("Error creating scaling events reporter: %w", err) + } + + globalState, globalPromReg := r.newAgentState(logger, r.EnvArgs.K8sPodIP, schedTracker, scalingReporter) watchMetrics.MustRegister(globalPromReg) logger.Info("Starting billing 
metrics collector") diff --git a/pkg/agent/globalstate.go b/pkg/agent/globalstate.go index 3342a7cf7..69be386ce 100644 --- a/pkg/agent/globalstate.go +++ b/pkg/agent/globalstate.go @@ -17,6 +17,7 @@ import ( vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned" + "github.com/neondatabase/autoscaling/pkg/agent/scalingevents" "github.com/neondatabase/autoscaling/pkg/agent/schedwatch" "github.com/neondatabase/autoscaling/pkg/api" "github.com/neondatabase/autoscaling/pkg/util" @@ -40,12 +41,15 @@ type agentState struct { vmClient *vmclient.Clientset schedTracker *schedwatch.SchedulerTracker metrics GlobalMetrics + + scalingReporter *scalingevents.Reporter } func (r MainRunner) newAgentState( baseLogger *zap.Logger, podIP string, schedTracker *schedwatch.SchedulerTracker, + scalingReporter *scalingevents.Reporter, ) (*agentState, *prometheus.Registry) { metrics, promReg := makeGlobalMetrics() @@ -59,6 +63,8 @@ func (r MainRunner) newAgentState( podIP: podIP, schedTracker: schedTracker, metrics: metrics, + + scalingReporter: scalingReporter, } return state, promReg diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 7d0778922..ccdbf956a 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -20,6 +20,7 @@ import ( "errors" "fmt" "io" + "math" "net/http" "runtime/debug" "strconv" @@ -36,6 +37,7 @@ import ( "github.com/neondatabase/autoscaling/pkg/agent/core" "github.com/neondatabase/autoscaling/pkg/agent/core/revsource" "github.com/neondatabase/autoscaling/pkg/agent/executor" + "github.com/neondatabase/autoscaling/pkg/agent/scalingevents" "github.com/neondatabase/autoscaling/pkg/agent/schedwatch" "github.com/neondatabase/autoscaling/pkg/api" "github.com/neondatabase/autoscaling/pkg/util" @@ -195,6 +197,11 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util if vmInfo.CurrentRevision != nil { initialRevision = vmInfo.CurrentRevision.Value } + // "dsrl" stands for "desired scaling report limiter" -- helper to avoid spamming events. 
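+ // The limiter lives for the lifetime of this Runner, so its state persists across
+ // iterations; see desiredScalingReportLimiter.report below for how near-duplicate
+ // reports are suppressed.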
+ dsrl := &desiredScalingReportLimiter{ + lastTarget: nil, + lastParts: nil, + } revisionSource := revsource.NewRevisionSource(initialRevision, WrapHistogramVec(&r.global.metrics.scalingLatency)) executorCore := executor.NewExecutorCore(coreExecLogger, vmInfo, executor.Config{ OnNextActions: r.global.metrics.runnerNextActions.Inc, @@ -217,6 +224,10 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util PluginLatency: WrapHistogramVec(&r.global.metrics.pluginLatency), MonitorLatency: WrapHistogramVec(&r.global.metrics.monitorLatency), NeonVMLatency: WrapHistogramVec(&r.global.metrics.neonvmLatency), + ScalingEvent: r.reportScalingEvent, + DesiredScaling: func(ts time.Time, current, target uint32, parts scalingevents.GoalCUComponents) { + r.reportDesiredScaling(dsrl, ts, current, target, parts) + }, }, }, }) @@ -322,6 +333,102 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util } } +func (r *Runner) reportScalingEvent(timestamp time.Time, currentCU, targetCU uint32) { + var endpointID string + + enabled := func() bool { + r.status.mu.Lock() + defer r.status.mu.Unlock() + + endpointID = r.status.endpointID + return endpointID != "" && r.status.vmInfo.Config.ReportScalingEvents + }() + if !enabled { + return + } + + reporter := r.global.scalingReporter + reporter.Submit(reporter.NewRealEvent( + timestamp, + endpointID, + currentCU, + targetCU, + )) +} + +func (r *Runner) reportDesiredScaling( + rl *desiredScalingReportLimiter, + timestamp time.Time, + currentCU uint32, + targetCU uint32, + parts scalingevents.GoalCUComponents, +) { + var endpointID string + + enabled := func() bool { + r.status.mu.Lock() + defer r.status.mu.Unlock() + + endpointID = r.status.endpointID + return endpointID != "" && r.status.vmInfo.Config.ReportDesiredScaling + }() + if !enabled { + return + } + + // TODO: Use this opportunity to report the desired scaling in the per-VM + // metrics. + + rl.report(r.global.scalingReporter, timestamp, endpointID, currentCU, targetCU, parts) +} + +type desiredScalingReportLimiter struct { + lastTarget *uint32 + lastParts *scalingevents.GoalCUComponents +} + +func (rl *desiredScalingReportLimiter) report( + reporter *scalingevents.Reporter, + timestamp time.Time, + endpointID string, + currentCU uint32, + targetCU uint32, + parts scalingevents.GoalCUComponents, +) { + closeEnough := func(x *float64, y *float64) bool { + if (x != nil) != (y != nil) { + return false + } else if x == nil /* && y == nil */ { + return true + } else { + // true iff x and y are within the threshold of each other + return math.Abs(*x-*y) < 0.25 + } + } + + // Check if we should skip this time. + if rl.lastTarget != nil && rl.lastParts != nil { + skip := *rl.lastTarget == targetCU && + closeEnough(rl.lastParts.CPU, parts.CPU) && + closeEnough(rl.lastParts.Mem, parts.Mem) && + closeEnough(rl.lastParts.LFC, parts.LFC) + if skip { + return + } + } + + // Not skipping. 
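+ // Record what we're about to report so that subsequent calls with the same
+ // target CU and per-component goals within the threshold are deduplicated.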
+ rl.lastTarget = &targetCU + rl.lastParts = &parts + reporter.Submit(reporter.NewHypotheticalEvent( + timestamp, + endpointID, + currentCU, + targetCU, + parts, + )) +} + ////////////////////// // Background tasks // ////////////////////// diff --git a/pkg/agent/scalingevents/clients.go b/pkg/agent/scalingevents/clients.go new file mode 100644 index 000000000..de1d8e98c --- /dev/null +++ b/pkg/agent/scalingevents/clients.go @@ -0,0 +1,64 @@ +package scalingevents + +import ( + "context" + "fmt" + "time" + + "github.com/lithammer/shortuuid" + "go.uber.org/zap" + + "github.com/neondatabase/autoscaling/pkg/reporting" +) + +type ClientsConfig struct { + S3 *S3ClientConfig `json:"s3"` +} + +type S3ClientConfig struct { + reporting.BaseClientConfig + reporting.S3ClientConfig + PrefixInBucket string `json:"prefixInBucket"` +} + +type eventsClient = reporting.Client[ScalingEvent] + +func createClients(ctx context.Context, logger *zap.Logger, cfg ClientsConfig) ([]eventsClient, error) { + var clients []eventsClient + + if c := cfg.S3; c != nil { + generateKey := newBlobStorageKeyGenerator(c.PrefixInBucket) + client, err := reporting.NewS3Client(ctx, c.S3ClientConfig, generateKey) + if err != nil { + return nil, fmt.Errorf("error creating S3 client: %w", err) + } + logger.Info("Created S3 client for scaling events", zap.Any("config", c)) + + clients = append(clients, eventsClient{ + Name: "s3", + Base: client, + BaseConfig: c.BaseClientConfig, + GenerateTraceID: shortuuid.New, + SerializeBatch: reporting.WrapSerialize[ScalingEvent](reporting.GZIPCompress, reporting.JSONLinesMarshalBatch), + }) + } + + return clients, nil +} + +// Returns a function to generate keys for the placement of scaling events data into blob storage. +// +// Example: prefix/2024/10/31/23/events_{uuid}.ndjson.gz (11pm on halloween, UTC) +func newBlobStorageKeyGenerator(prefix string) func() string { + return func() string { + now := time.Now().UTC() + id := shortuuid.New() + + return fmt.Sprintf( + "%s/%d/%02d/%02d/%02d/events_%s.ndjson.gz", + prefix, + now.Year(), now.Month(), now.Day(), now.Hour(), + id, + ) + } +} diff --git a/pkg/agent/scalingevents/prommetrics.go b/pkg/agent/scalingevents/prommetrics.go new file mode 100644 index 000000000..d2b827803 --- /dev/null +++ b/pkg/agent/scalingevents/prommetrics.go @@ -0,0 +1,29 @@ +package scalingevents + +// Prometheus metrics for the agent's scaling event reporting subsystem + +import ( + "github.com/prometheus/client_golang/prometheus" + + "github.com/neondatabase/autoscaling/pkg/reporting" +) + +type PromMetrics struct { + reporting *reporting.EventSinkMetrics + totalCount prometheus.Gauge +} + +func NewPromMetrics() PromMetrics { + return PromMetrics{ + reporting: reporting.NewEventSinkMetrics("autoscaling_agent_events"), + totalCount: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "autoscaling_agent_scaling_events_total", + Help: "Total number of scaling events generated", + }), + } +} + +func (m PromMetrics) MustRegister(reg *prometheus.Registry) { + m.reporting.MustRegister(reg) + reg.MustRegister(m.totalCount) +} diff --git a/pkg/agent/scalingevents/reporter.go b/pkg/agent/scalingevents/reporter.go new file mode 100644 index 000000000..a7120be6c --- /dev/null +++ b/pkg/agent/scalingevents/reporter.go @@ -0,0 +1,142 @@ +package scalingevents + +import ( + "context" + "math" + "time" + + "github.com/samber/lo" + "go.uber.org/zap" + + "github.com/neondatabase/autoscaling/pkg/reporting" +) + +type Config struct { + // CUMultiplier sets the ratio between our internal 
compute unit and the one that should be + // reported. + // + // This exists because Neon allows fractional compute units, while the autoscaler-agent acts on + // integer multiples of a smaller compute unit. + CUMultiplier float64 `json:"cuMultiplier"` + + // RereportThreshold sets the minimum amount of change in desired compute units required for us to + // re-report the desired scaling. + RereportThreshold float64 `json:"rereportThreshold"` + + ClusterName string `json:"clusterName"` + RegionName string `json:"regionName"` + + Clients ClientsConfig `json:"clients"` +} + +type Reporter struct { + conf *Config + sink *reporting.EventSink[ScalingEvent] + metrics PromMetrics +} + +type ScalingEvent struct { + Timestamp time.Time `json:"timestamp"` + Region string `json:"region"` + Cluster string `json:"cluster"` + EndpointID string `json:"endpoint_id"` + Type scalingEventType `json:"type"` + CurrentMilliCU uint32 `json:"current_cu"` + TargetMilliCU uint32 `json:"target_cu"` + GoalComponents *GoalCUComponents `json:"goalComponents,omitempty"` +} + +type GoalCUComponents struct { + CPU *float64 `json:"cpu,omitempty"` + Mem *float64 `json:"mem,omitempty"` + LFC *float64 `json:"lfc,omitempty"` +} + +type scalingEventType string + +const ( + scalingEventReal = "real" + scalingEventHypothetical = "hypothetical" +) + +func NewReporter( + ctx context.Context, + parentLogger *zap.Logger, + conf *Config, + metrics PromMetrics, +) (*Reporter, error) { + logger := parentLogger.Named("scalingevents") + + clients, err := createClients(ctx, logger, conf.Clients) + if err != nil { + return nil, err + } + + sink := reporting.NewEventSink(logger, metrics.reporting, clients...) + + return &Reporter{ + conf: conf, + sink: sink, + metrics: metrics, + }, nil +} + +// Submit adds the ScalingEvent to the sender queue(s), returning without waiting for it to be sent. +func (r *Reporter) Submit(event ScalingEvent) { + r.sink.Enqueue(event) +} + +func convertToMilliCU(cu uint32, multiplier float64) uint32 { + return uint32(math.Round(1000 * float64(cu) * multiplier)) +} + +// NewRealEvent is a helper function to create a ScalingEvent for actual scaling that has occurred. +// +// This method also handles compute unit translation. 
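+//
+// Current and target CU are converted to milli-CU using the configured
+// CUMultiplier (for example, with cuMultiplier = 0.25, 4 internal CU is reported
+// as 1000 milli-CU); GoalComponents is left nil for real events.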
+func (r *Reporter) NewRealEvent( + timestamp time.Time, + endpointID string, + currentCU uint32, + targetCU uint32, +) ScalingEvent { + return ScalingEvent{ + Timestamp: timestamp, + Region: r.conf.RegionName, + Cluster: r.conf.ClusterName, + EndpointID: endpointID, + Type: scalingEventReal, + CurrentMilliCU: convertToMilliCU(currentCU, r.conf.CUMultiplier), + TargetMilliCU: convertToMilliCU(targetCU, r.conf.CUMultiplier), + GoalComponents: nil, + } +} + +func (r *Reporter) NewHypotheticalEvent( + timestamp time.Time, + endpointID string, + currentCU uint32, + targetCU uint32, + goalCUs GoalCUComponents, +) ScalingEvent { + convertFloat := func(cu *float64) *float64 { + if cu != nil { + return lo.ToPtr(*cu * r.conf.CUMultiplier) + } + return nil + } + + return ScalingEvent{ + Timestamp: timestamp, + Region: r.conf.RegionName, + Cluster: r.conf.ClusterName, + EndpointID: endpointID, + Type: scalingEventHypothetical, + CurrentMilliCU: convertToMilliCU(currentCU, r.conf.CUMultiplier), + TargetMilliCU: convertToMilliCU(targetCU, r.conf.CUMultiplier), + GoalComponents: &GoalCUComponents{ + CPU: convertFloat(goalCUs.CPU), + Mem: convertFloat(goalCUs.Mem), + LFC: convertFloat(goalCUs.LFC), + }, + } +} diff --git a/pkg/api/types.go b/pkg/api/types.go index d71570def..9cf6158a4 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -381,6 +381,23 @@ func (r Resources) Mul(factor uint16) Resources { } } +// DivResources divides the resources by the smaller amount, returning the uint16 value such that +// other.Mul(factor) is equal to the original resources. +// +// If r is not an integer multiple of other, then (0, false) will be returned. +func (r Resources) DivResources(other Resources) (uint16, bool) { + cpuFactor := uint16(r.VCPU / other.VCPU) + cpuOk := r.VCPU%other.VCPU == 0 + memFactor := uint16(r.Mem / other.Mem) + memOk := r.Mem%other.Mem == 0 + + if !cpuOk || !memOk || cpuFactor != memFactor { + return 0, false + } + + return cpuFactor, true // already known equal to memFactor +} + // AbsDiff returns a new Resources with each field F as the absolute value of the difference between // r.F and cmp.F func (r Resources) AbsDiff(cmp Resources) Resources { diff --git a/pkg/api/vminfo.go b/pkg/api/vminfo.go index e50c5f303..9a936415b 100644 --- a/pkg/api/vminfo.go +++ b/pkg/api/vminfo.go @@ -26,6 +26,10 @@ const ( AnnotationAutoscalingBounds = "autoscaling.neon.tech/bounds" AnnotationAutoscalingConfig = "autoscaling.neon.tech/config" AnnotationBillingEndpointID = "autoscaling.neon.tech/billing-endpoint-id" + + // ref cloud#15939; to be removed after rollout is complete. + LabelReportScalingEvents = "autoscaling.neon.tech/report-scaling-events" + LabelReportDesiredScaling = "autoscaling.neon.tech/report-desired-scaling" ) func hasTrueLabel(obj metav1.ObjectMetaAccessor, labelName string) bool { @@ -49,6 +53,14 @@ func HasAlwaysMigrateLabel(obj metav1.ObjectMetaAccessor) bool { return hasTrueLabel(obj, LabelTestingOnlyAlwaysMigrate) } +func HasReportScalingEventsLabel(obj metav1.ObjectMetaAccessor) bool { + return hasTrueLabel(obj, LabelReportScalingEvents) +} + +func HasReportDesiredScalingLabel(obj metav1.ObjectMetaAccessor) bool { + return hasTrueLabel(obj, LabelReportDesiredScaling) +} + // VmInfo is the subset of vmapi.VirtualMachineSpec that the scheduler plugin and autoscaler agent // care about. It takes various labels and annotations into account, so certain fields might be // different from what's strictly in the VirtualMachine object. 
@@ -113,6 +125,9 @@ type VmConfig struct { AlwaysMigrate bool `json:"alwaysMigrate"` ScalingEnabled bool `json:"scalingEnabled"` ScalingConfig *ScalingConfig `json:"scalingConfig,omitempty"` + + ReportScalingEvents bool `json:"reportScalingEvents"` + ReportDesiredScaling bool `json:"reportDesiredScaling"` } // Using returns the Resources that this VmInfo says the VM is using @@ -186,6 +201,8 @@ func extractVmInfoGeneric( autoMigrationEnabled := HasAutoMigrationEnabled(obj) scalingEnabled := HasAutoscalingEnabled(obj) alwaysMigrate := HasAlwaysMigrateLabel(obj) + reportScalingEvents := HasReportScalingEventsLabel(obj) + reportDesiredScaling := HasReportDesiredScalingLabel(obj) info := VmInfo{ Name: vmName, @@ -197,6 +214,9 @@ func extractVmInfoGeneric( AlwaysMigrate: alwaysMigrate, ScalingEnabled: scalingEnabled, ScalingConfig: nil, // set below, maybe + + ReportScalingEvents: reportScalingEvents, + ReportDesiredScaling: reportDesiredScaling, }, CurrentRevision: nil, // set later, maybe }
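
Illustrative sketch (not part of the patch): the standalone Go program below mirrors the unexported convertToMilliCU helper from pkg/agent/scalingevents/reporter.go, to show how internal compute units map to the reported milli-CU values under the default cuMultiplier of 0.25 from autoscaler-agent/config_map.yaml.

package main

import (
	"fmt"
	"math"
)

// convertToMilliCU mirrors the helper added in reporter.go: internal compute
// units are scaled by the configured multiplier and reported in milli-CU.
func convertToMilliCU(cu uint32, multiplier float64) uint32 {
	return uint32(math.Round(1000 * float64(cu) * multiplier))
}

func main() {
	const cuMultiplier = 0.25 // default from autoscaler-agent/config_map.yaml

	// A "real" scaling event from 4 CU to 8 CU (internal units) is reported
	// as current_cu=1000, target_cu=2000 (milli-CU).
	fmt.Println(convertToMilliCU(4, cuMultiplier)) // 1000
	fmt.Println(convertToMilliCU(8, cuMultiplier)) // 2000
}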