Scale service destination based on available memory (#11739)

elastic · Oct 2, 2023 · 60f6ac5 · 60f6ac5
1 parent 1df20b9
commit 60f6ac5
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 24 deletions.
diff --git a/changelogs/head.asciidoc b/changelogs/head.asciidoc
@@ -15,6 +15,7 @@ https://github.com/elastic/apm-server/compare/8.10\...main[View commits]
 - Add back gzip support for grpc otlp endpoint {pull}11434[11434]
 - Correctly mark jvm.memory.non_heap.pool.* and jvm.fd.* metrics as internal {pull}11303[11303]
 - Fix tail-based sampling discarding low throughput and low sample rate traces {pull}11642[11642]
+- Add memory based autoscaling for service destination aggregation groups {pull}11739[11739]
 
 [float]
 ==== Intake API Changes

diff --git a/docs/data-model.asciidoc b/docs/data-model.asciidoc
@@ -548,9 +548,10 @@ which is 500 transaction groups per service per GB of APM Server.
 ** For service-transaction metrics, there is an additional limit of 1000 total service transaction groups per GB of APM Server,
 and each service may only consume up to 10% of the service transaction groups,
 which is 100 service transaction groups per service per GB of APM Server.
-** For service-destination metrics, there is an additional limit of a constant 10000 total service destination groups,
+** For service-destination metrics, there is an additional limit on the total number of service destination groups,
+which starts at 10000 for a 1 GB APM Server and increases by 5000 for each additional GB of APM Server,
 and each service may only consume up to 10% of the service destination groups,
-which is 1000 service destination groups per service.
+which is 1000 service destination groups per service for a 1 GB APM Server, increasing by 500 for each additional GB of APM Server.
 ** For service-summary metrics, there is no additional limit.
 
 In the above, a service is defined as a combination of `service.name`, `service.environment`, `service.language.name` and `agent.name`.

diff --git a/internal/beater/beater.go b/internal/beater/beater.go
@@ -230,23 +230,30 @@ func (s *Runner) Run(ctx context.Context) error {
 	}
 
 	if s.config.Aggregation.MaxServices <= 0 {
-		s.config.Aggregation.MaxServices = maxGroupsForAggregation(memLimitGB)
+		s.config.Aggregation.MaxServices = linearScaledValue(1_000, memLimitGB, 0)
 		s.logger.Infof("Aggregation.MaxServices set to %d based on %0.1fgb of memory",
 			s.config.Aggregation.MaxServices, memLimitGB,
 		)
 	}
 
+	if s.config.Aggregation.ServiceTransactions.MaxGroups <= 0 {
+		s.config.Aggregation.ServiceTransactions.MaxGroups = linearScaledValue(1_000, memLimitGB, 0)
+		s.logger.Infof("Aggregation.ServiceTransactions.MaxGroups for service aggregation set to %d based on %0.1fgb of memory",
+			s.config.Aggregation.ServiceTransactions.MaxGroups, memLimitGB,
+		)
+	}
+
 	if s.config.Aggregation.Transactions.MaxGroups <= 0 {
-		s.config.Aggregation.Transactions.MaxGroups = maxTxGroupsForAggregation(memLimitGB)
+		s.config.Aggregation.Transactions.MaxGroups = linearScaledValue(5_000, memLimitGB, 0)
 		s.logger.Infof("Aggregation.Transactions.MaxGroups set to %d based on %0.1fgb of memory",
 			s.config.Aggregation.Transactions.MaxGroups, memLimitGB,
 		)
 	}
 
-	if s.config.Aggregation.ServiceTransactions.MaxGroups <= 0 {
-		s.config.Aggregation.ServiceTransactions.MaxGroups = maxGroupsForAggregation(memLimitGB)
-		s.logger.Infof("Aggregation.ServiceTransactions.MaxGroups for service aggregation set to %d based on %0.1fgb of memory",
-			s.config.Aggregation.ServiceTransactions.MaxGroups, memLimitGB,
+	if s.config.Aggregation.ServiceDestinations.MaxGroups <= 0 {
+		s.config.Aggregation.ServiceDestinations.MaxGroups = linearScaledValue(5_000, memLimitGB, 5_000) // 5_000 per GB plus a constant 5_000
+		s.logger.Infof("Aggregation.ServiceDestinations.MaxGroups set to %d based on %0.1fgb of memory",
+			s.config.Aggregation.ServiceDestinations.MaxGroups, memLimitGB,
 		)
 	}
 
@@ -568,26 +575,14 @@ func maxConcurrentDecoders(memLimitGB float64) uint {
 	return decoders
 }
 
-// maxGroupsForAggregation calculates the maximum service groups that a
-// particular memory limit can have. This will be scaled linearly for bigger
-// instances.
-func maxGroupsForAggregation(memLimitGB float64) int {
-	const maxMemGB = 64
-	if memLimitGB > maxMemGB {
-		memLimitGB = maxMemGB
-	}
-	return int(memLimitGB * 1_000)
-}
-
-// maxTxGroupsForAggregation calculates the maximum transaction groups that a
-// particular memory limit can have. This will be scaled linearly for bigger
-// instances.
-func maxTxGroupsForAggregation(memLimitGB float64) int {
+// linearScaledValue returns a value that scales linearly with the memory limit:
+// y = (perGBIncrement * memLimitGB) + constant, with memLimitGB capped at 64.
+func linearScaledValue(perGBIncrement, memLimitGB, constant float64) int {
 	const maxMemGB = 64
 	if memLimitGB > maxMemGB {
 		memLimitGB = maxMemGB
 	}
-	return int(memLimitGB * 5_000)
+	return int(memLimitGB*perGBIncrement + constant) // conversion to int drops the fractional part
 }
 
 // waitReady waits until the server is ready to index events.