Skip to content

Commit

Permalink
Merge branch 'main' into enable-self-instrumentation-tracing
Browse files Browse the repository at this point in the history
  • Loading branch information
1pkg committed Oct 22, 2024
2 parents 25e7663 + 6c70484 commit 289871a
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 20 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// tag::list[]
* <<apm-release-notes-8.16>>
* <<apm-release-notes-8.15>>
* <<apm-release-notes-8.14>>
* <<apm-release-notes-8.13>>
Expand All @@ -19,6 +20,7 @@
// tag::includes[]
include::./changelogs/head.asciidoc[]
include::./changelogs/8.16.asciidoc[]
include::./changelogs/8.15.asciidoc[]
include::./changelogs/8.14.asciidoc[]
include::./changelogs/8.13.asciidoc[]
Expand Down
33 changes: 33 additions & 0 deletions changelogs/8.16.asciidoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[[apm-release-notes-8.16]]
== APM version 8.16
* <<apm-release-notes-8.16.0>>

[float]
[[apm-release-notes-8.16.0]]
=== APM version 8.16.0

https://github.com/elastic/apm-server/compare/v8.15.2\...v8.16.0[View commits]

[float]
==== Bug fixes

- Track all bulk request response status codes {pull}13574[13574]
- Fix a concurrent map write panic in monitoring middleware {pull}14335[14335]
- Apply shutdown timeout to http server {pull}14339[14339]
- Tail-based sampling: Fix a rare GC thread failure after an Elastic Agent hot reload that caused storage to not be reclaimed, leaving APM Server stuck with "storage limit reached" {pull}13574[13574]

[float]
==== Breaking Changes

[float]
==== Deprecations
- Support for Jaeger is now deprecated, and will be removed in a future release {pull}13809[13809]

[float]
==== Intake API Changes

[float]
==== Added

- APM Server will no longer retry an HTTP request that returned 502s, 503s, 504s. It will only retry 429s. {pull}13523[13523]
- APM Server now supports emitting distributed tracing for its own operation when running under Elastic Agent, and adds support for configuring a sampling rate {pull}14231[14231]
13 changes: 1 addition & 12 deletions changelogs/head.asciidoc
Original file line number Diff line number Diff line change
@@ -1,27 +1,16 @@
[[release-notes-head]]
== APM version HEAD

https://github.com/elastic/apm-server/compare/8.15\...main[View commits]

[float]
==== Bug fixes

- Track all bulk request response status codes {pull}13574[13574]
- Fix a concurrent map write panic in monitoring middleware {pull}14335[14335]
- Apply shutdown timeout to http server {pull}14339[14339]
https://github.com/elastic/apm-server/compare/8.16\...8.x[View commits]

[float]
==== Breaking Changes

[float]
==== Deprecations
- Support for Jaeger is now deprecated, and will be removed in a future release {pull}13809[13809]

[float]
==== Intake API Changes

[float]
==== Added

- APM Server will no longer retry an HTTP request that returned 502s, 503s, 504s. It will only retry 429s. {pull}13523[13523]
- APM Server now supports emitting distributed tracing for its own operation when running under Elastic Agent, and adds support for configuring a sampling rate {pull}14231[14231]
4 changes: 2 additions & 2 deletions systemtest/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ require (
github.com/hashicorp/go-multierror v1.1.1
github.com/jaegertracing/jaeger v1.62.0
github.com/stretchr/testify v1.9.0
github.com/testcontainers/testcontainers-go v0.33.0
github.com/testcontainers/testcontainers-go v0.34.0
github.com/tidwall/gjson v1.18.0
go.elastic.co/apm/v2 v2.6.2
go.elastic.co/fastjson v1.4.0
Expand Down Expand Up @@ -47,7 +47,7 @@ require (
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/containerd/log v0.1.0 // indirect
github.com/containerd/platforms v0.2.1 // indirect
github.com/cpuguy83/dockercfg v0.3.1 // indirect
github.com/cpuguy83/dockercfg v0.3.2 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/go-units v0.5.0 // indirect
Expand Down
9 changes: 5 additions & 4 deletions systemtest/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo=
github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A=
github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw=
github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E=
github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc=
github.com/cpuguy83/dockercfg v0.3.2 h1:DlJTyZGBDlXqUZ2Dk2Q3xHs/FtnooJJVaad2S9GKorA=
github.com/cpuguy83/dockercfg v0.3.2/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc=
github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY=
github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down Expand Up @@ -143,6 +143,7 @@ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVs
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
Expand All @@ -151,8 +152,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/testcontainers/testcontainers-go v0.33.0 h1:zJS9PfXYT5O0ZFXM2xxXfk4J5UMw/kRiISng037Gxdw=
github.com/testcontainers/testcontainers-go v0.33.0/go.mod h1:W80YpTa8D5C3Yy16icheD01UTDu+LmXIA2Keo+jWtT8=
github.com/testcontainers/testcontainers-go v0.34.0 h1:5fbgF0vIN5u+nD3IWabQwRybuB4GY8G2HHgCkbMzMHo=
github.com/testcontainers/testcontainers-go v0.34.0/go.mod h1:6P/kMkQe8yqPHfPWNulFGdFHTD8HB2vLq/231xY2iPQ=
github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
Expand Down
2 changes: 1 addition & 1 deletion testing/apmsoak/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ terraform.tfvars:

.PHONY: use-production
use-production:
@terraform workspace select production || terraform worksapce new production
@terraform workspace select production || terraform workspace new production

.PHONY: apmsoak
apmsoak:
Expand Down
28 changes: 27 additions & 1 deletion x-pack/apm-server/sampling/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ const (
shutdownGracePeriod = 5 * time.Second
)

// gcCh is a one-slot channel used as a process-wide lock: a goroutine acquires
// it by sending a value and releases it by receiving one. It keeps the storage
// garbage collection goroutine from running concurrently when two TBS
// processors are active during a hot reload.
var gcCh = make(chan struct{}, 1)

// Processor is a tail-sampling event processor.
type Processor struct {
config Config
Expand Down Expand Up @@ -386,6 +391,16 @@ func (p *Processor) Run() error {
}
})
g.Go(func() error {
// Protect this goroutine from running concurrently when 2 TBS processors are active
// as badger GC is not concurrent safe.
select {
case <-p.stopping:
return nil
case gcCh <- struct{}{}:
}
defer func() {
<-gcCh
}()
// This goroutine is responsible for periodically garbage
// collecting the Badger value log, using the recommended
// discard ratio of 0.5.
Expand All @@ -411,7 +426,9 @@ func (p *Processor) Run() error {
})
g.Go(func() error {
// Subscribe to remotely sampled trace IDs. This is cancelled immediately when
// Stop is called. The next subscriber will pick up from the previous position.
// Stop is called. But it is possible that both old and new subscriber goroutines
// run concurrently, before the old one eventually receives the Stop call.
// The next subscriber will pick up from the previous position.
defer close(remoteSampledTraceIDs)
defer close(subscriberPositions)
ctx, cancel := context.WithCancel(context.Background())
Expand Down Expand Up @@ -558,7 +575,13 @@ func (p *Processor) Run() error {
return nil
}

// subscriberPositionFileMutex serializes reads and writes of the subscriber
// position file. It is package-level so that it is shared by every processor
// in the process, which matters when an old and a new processor run
// concurrently during a hot reload.
var subscriberPositionFileMutex sync.Mutex

func readSubscriberPosition(logger *logp.Logger, storageDir string) (pubsub.SubscriberPosition, error) {
subscriberPositionFileMutex.Lock()
defer subscriberPositionFileMutex.Unlock()

var pos pubsub.SubscriberPosition
data, err := os.ReadFile(filepath.Join(storageDir, subscriberPositionFile))
if errors.Is(err, os.ErrNotExist) {
Expand All @@ -579,6 +602,9 @@ func writeSubscriberPosition(storageDir string, pos pubsub.SubscriberPosition) e
if err != nil {
return err
}

subscriberPositionFileMutex.Lock()
defer subscriberPositionFileMutex.Unlock()
return os.WriteFile(filepath.Join(storageDir, subscriberPositionFile), data, 0644)
}

Expand Down
26 changes: 26 additions & 0 deletions x-pack/apm-server/sampling/processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/pkg/errors"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/sync/errgroup"
"google.golang.org/protobuf/testing/protocmp"

"github.com/elastic/apm-data/model/modelpb"
Expand Down Expand Up @@ -668,6 +669,31 @@ func TestStorageGC(t *testing.T) {
t.Fatal("timed out waiting for value log garbage collection")
}

// TestStorageGCConcurrency ensures that TBS processors sharing one config do
// not return an error from Run or Stop when they run concurrently, e.g. while
// an old and a new processor overlap during a hot reload.
func TestStorageGCConcurrency(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping slow test")
	}

	config := newTempdirConfig(t)
	// Use very short intervals so that flushing and storage GC are triggered
	// many times within the one second each processor is kept running.
	config.TTL = 10 * time.Millisecond
	config.FlushInterval = 10 * time.Millisecond
	config.StorageGCInterval = 10 * time.Millisecond

	var g errgroup.Group
	for i := 0; i < 2; i++ {
		processor, err := sampling.NewProcessor(config)
		require.NoError(t, err)
		g.Go(processor.Run)
		// Stop runs inside the errgroup rather than in a bare goroutine so that
		// the test cannot return before Stop has finished, and so that an error
		// from Stop is reported on the test goroutine via g.Wait.
		g.Go(func() error {
			time.Sleep(time.Second)
			return processor.Stop(context.Background())
		})
	}
	assert.NoError(t, g.Wait())
}

func TestStorageLimit(t *testing.T) {
// This test ensures that when tail sampling is configured with a hard
// storage limit, the limit is respected once the size is available.
Expand Down

0 comments on commit 289871a

Please sign in to comment.