diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index dffd289c9bc..f31ddeb5dc8 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -1,4 +1,5 @@ // tag::list[] +* <<apm-release-notes-8.16>> * <<apm-release-notes-8.15>> * <<apm-release-notes-8.14>> * <<apm-release-notes-8.13>> @@ -19,6 +20,7 @@ // tag::includes[] include::./changelogs/head.asciidoc[] +include::./changelogs/8.16.asciidoc[] include::./changelogs/8.15.asciidoc[] include::./changelogs/8.14.asciidoc[] include::./changelogs/8.13.asciidoc[] diff --git a/changelogs/8.16.asciidoc b/changelogs/8.16.asciidoc new file mode 100644 index 00000000000..b4d50ab837f --- /dev/null +++ b/changelogs/8.16.asciidoc @@ -0,0 +1,33 @@ +[[apm-release-notes-8.16]] +== APM version 8.16 +* <<apm-release-notes-8.16.0>> + +[float] +[[apm-release-notes-8.16.0]] +=== APM version 8.16.0 + +https://github.com/elastic/apm-server/compare/v8.15.2\...v8.16.0[View commits] + +[float] +==== Bug fixes + +- Track all bulk request response status codes {pull}13574[13574] +- Fix a concurrent map write panic in monitoring middleware {pull}14335[14335] +- Apply shutdown timeout to http server {pull}14339[14339] +- Tail-based sampling: Fix rare gc thread failure after EA hot reload causing storage not reclaimed and stuck with "storage limit reached" {pull}13574[13574] + +[float] +==== Breaking Changes + +[float] +==== Deprecations +- Support for Jaeger is now deprecated, and will be removed in a future release {pull}13809[13809] + +[float] +==== Intake API Changes + +[float] +==== Added + +- APM Server will no longer retry an HTTP request that returned 502s, 503s, 504s. It will only retry 429s. 
{pull}13523[13523] +- APM Server now supports emitting distributed tracing for its own operation when running under Elastic Agent, and adds support for configuring a sampling rate {pull}14231[14231] diff --git a/changelogs/head.asciidoc b/changelogs/head.asciidoc index ebaf0ae7941..4b71ea29c89 100644 --- a/changelogs/head.asciidoc +++ b/changelogs/head.asciidoc @@ -1,27 +1,16 @@ [[release-notes-head]] == APM version HEAD -https://github.com/elastic/apm-server/compare/8.15\...main[View commits] - -[float] -==== Bug fixes - -- Track all bulk request response status codes {pull}13574[13574] -- Fix a concurrent map write panic in monitoring middleware {pull}14335[14335] -- Apply shutdown timeout to http server {pull}14339[14339] +https://github.com/elastic/apm-server/compare/8.16\...8.x[View commits] [float] ==== Breaking Changes [float] ==== Deprecations -- Support for Jaeger is now deprecated, and will be removed in a future release {pull}13809[13809] [float] ==== Intake API Changes [float] ==== Added - -- APM Server will no longer retry an HTTP request that returned 502s, 503s, 504s. It will only retry 429s. 
{pull}13523[13523] -- APM Server now supports emitting distributed tracing for its own operation when running under Elastic Agent, and adds support for configuring a sampling rate {pull}14231[14231] \ No newline at end of file diff --git a/systemtest/go.mod b/systemtest/go.mod index b64ab4a2f0e..fea21551317 100644 --- a/systemtest/go.mod +++ b/systemtest/go.mod @@ -14,7 +14,7 @@ require ( github.com/hashicorp/go-multierror v1.1.1 github.com/jaegertracing/jaeger v1.62.0 github.com/stretchr/testify v1.9.0 - github.com/testcontainers/testcontainers-go v0.33.0 + github.com/testcontainers/testcontainers-go v0.34.0 github.com/tidwall/gjson v1.18.0 go.elastic.co/apm/v2 v2.6.2 go.elastic.co/fastjson v1.4.0 @@ -47,7 +47,7 @@ require ( github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/containerd/log v0.1.0 // indirect github.com/containerd/platforms v0.2.1 // indirect - github.com/cpuguy83/dockercfg v0.3.1 // indirect + github.com/cpuguy83/dockercfg v0.3.2 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/distribution/reference v0.6.0 // indirect github.com/docker/go-units v0.5.0 // indirect diff --git a/systemtest/go.sum b/systemtest/go.sum index 3d818b4e9d0..86ef6461e73 100644 --- a/systemtest/go.sum +++ b/systemtest/go.sum @@ -17,8 +17,8 @@ github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A= github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= -github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= -github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= +github.com/cpuguy83/dockercfg v0.3.2 h1:DlJTyZGBDlXqUZ2Dk2Q3xHs/FtnooJJVaad2S9GKorA= +github.com/cpuguy83/dockercfg v0.3.2/go.mod 
h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -143,6 +143,7 @@ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVs github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -151,8 +152,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/testcontainers/testcontainers-go v0.33.0 h1:zJS9PfXYT5O0ZFXM2xxXfk4J5UMw/kRiISng037Gxdw= -github.com/testcontainers/testcontainers-go v0.33.0/go.mod h1:W80YpTa8D5C3Yy16icheD01UTDu+LmXIA2Keo+jWtT8= +github.com/testcontainers/testcontainers-go v0.34.0 h1:5fbgF0vIN5u+nD3IWabQwRybuB4GY8G2HHgCkbMzMHo= +github.com/testcontainers/testcontainers-go v0.34.0/go.mod h1:6P/kMkQe8yqPHfPWNulFGdFHTD8HB2vLq/231xY2iPQ= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= github.com/tidwall/gjson 
v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= diff --git a/testing/apmsoak/Makefile b/testing/apmsoak/Makefile index a3302140970..12d1bf8438e 100644 --- a/testing/apmsoak/Makefile +++ b/testing/apmsoak/Makefile @@ -14,7 +14,7 @@ terraform.tfvars: .PHONY: use-production use-production: - @terraform workspace select production || terraform worksapce new production + @terraform workspace select production || terraform workspace new production .PHONY: apmsoak apmsoak: diff --git a/x-pack/apm-server/sampling/processor.go b/x-pack/apm-server/sampling/processor.go index 82dc2df59aa..9402408b299 100644 --- a/x-pack/apm-server/sampling/processor.go +++ b/x-pack/apm-server/sampling/processor.go @@ -40,6 +40,11 @@ const ( shutdownGracePeriod = 5 * time.Second ) +var ( + // gcCh works like a global mutex to protect gc from running concurrently when 2 TBS processors are active during a hot reload + gcCh = make(chan struct{}, 1) +) + // Processor is a tail-sampling event processor. type Processor struct { config Config @@ -386,6 +391,16 @@ func (p *Processor) Run() error { } }) g.Go(func() error { + // Protect this goroutine from running concurrently when 2 TBS processors are active + // as badger GC is not concurrent safe. + select { + case <-p.stopping: + return nil + case gcCh <- struct{}{}: + } + defer func() { + <-gcCh + }() // This goroutine is responsible for periodically garbage // collecting the Badger value log, using the recommended // discard ratio of 0.5. @@ -411,7 +426,9 @@ func (p *Processor) Run() error { }) g.Go(func() error { // Subscribe to remotely sampled trace IDs. This is cancelled immediately when - // Stop is called. The next subscriber will pick up from the previous position. + // Stop is called. But it is possible that both old and new subscriber goroutines + // run concurrently, before the old one eventually receives the Stop call. + // The next subscriber will pick up from the previous position. 
defer close(remoteSampledTraceIDs) defer close(subscriberPositions) ctx, cancel := context.WithCancel(context.Background()) @@ -558,7 +575,13 @@ func (p *Processor) Run() error { return nil } +// subscriberPositionFileMutex protects the subscriber file from concurrent RW, in case of hot reload. +var subscriberPositionFileMutex sync.Mutex + func readSubscriberPosition(logger *logp.Logger, storageDir string) (pubsub.SubscriberPosition, error) { + subscriberPositionFileMutex.Lock() + defer subscriberPositionFileMutex.Unlock() + var pos pubsub.SubscriberPosition data, err := os.ReadFile(filepath.Join(storageDir, subscriberPositionFile)) if errors.Is(err, os.ErrNotExist) { @@ -579,6 +602,9 @@ func writeSubscriberPosition(storageDir string, pos pubsub.SubscriberPosition) e if err != nil { return err } + + subscriberPositionFileMutex.Lock() + defer subscriberPositionFileMutex.Unlock() return os.WriteFile(filepath.Join(storageDir, subscriberPositionFile), data, 0644) } diff --git a/x-pack/apm-server/sampling/processor_test.go b/x-pack/apm-server/sampling/processor_test.go index f17500da501..e0bf38f77b2 100644 --- a/x-pack/apm-server/sampling/processor_test.go +++ b/x-pack/apm-server/sampling/processor_test.go @@ -22,6 +22,7 @@ import ( "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "golang.org/x/sync/errgroup" "google.golang.org/protobuf/testing/protocmp" "github.com/elastic/apm-data/model/modelpb" @@ -668,6 +669,31 @@ func TestStorageGC(t *testing.T) { t.Fatal("timed out waiting for value log garbage collection") } +func TestStorageGCConcurrency(t *testing.T) { + // This test ensures that TBS processor does not return an error + // even when run concurrently e.g. 
in hot reload + if testing.Short() { + t.Skip("skipping slow test") + } + + config := newTempdirConfig(t) + config.TTL = 10 * time.Millisecond + config.FlushInterval = 10 * time.Millisecond + config.StorageGCInterval = 10 * time.Millisecond + + g := errgroup.Group{} + for i := 0; i < 2; i++ { + processor, err := sampling.NewProcessor(config) + require.NoError(t, err) + g.Go(processor.Run) + go func() { + time.Sleep(time.Second) + assert.NoError(t, processor.Stop(context.Background())) + }() + } + assert.NoError(t, g.Wait()) +} + func TestStorageLimit(t *testing.T) { // This test ensures that when tail sampling is configured with a hard // storage limit, the limit is respected once the size is available.