Fix race condition in custom libbeat instrumentation #8900

Merged · 7 commits · Aug 22, 2022
1 change: 1 addition & 0 deletions changelogs/head.asciidoc
@@ -14,6 +14,7 @@ https://github.com/elastic/apm-server/compare/8.5\...main[View commits]
==== Bug fixes
- Set `message` instead of `labels.event` for Jaeger span events {pull}8765[8765]
- Fix event loss during reload of TBS processor {pull}8809[8809]
- Fix race condition on custom libbeat instrumentation {pull}8900[8900]

[float]
==== Intake API Changes
41 changes: 30 additions & 11 deletions internal/beater/beater.go
@@ -235,7 +235,7 @@ func (bt *beater) run(ctx context.Context, cancelContext context.CancelFunc, b *
if b.Config != nil {
reloader.outputConfig = b.Config.Output
}
if err := reloader.reload(); err != nil {
if err := reloader.reloadOnce(); err != nil {
return err
}
}
@@ -289,6 +289,7 @@ func (r *reloader) Reload(configs []*reload.ConfigWithMeta) error {
}

r.mu.Lock()
defer r.mu.Unlock()
r.rawConfig = integrationConfig.APMServer
// Merge in datastream namespace passed in from apm integration
if integrationConfig.DataStream != nil && integrationConfig.DataStream.Namespace != "" {
@@ -300,7 +301,6 @@ func (r *reloader) Reload(configs []*reload.ConfigWithMeta) error {
}
}
r.fleetConfig = &integrationConfig.Fleet
r.mu.Unlock()
return r.reload()
}

@@ -312,14 +312,23 @@ func (r *reloader) reloadOutput(config *reload.ConfigWithMeta) error {
}
}
r.mu.Lock()
defer r.mu.Unlock()
r.outputConfig = outputConfig
r.mu.Unlock()
return r.reload()
}

func (r *reloader) reload() error {
func (r *reloader) reloadOnce() error {
r.mu.Lock()
defer r.mu.Unlock()

return r.reload()
}

// reload creates a new serverRunner, launches it in a new goroutine, waits
// for it to have successfully started and returns after waiting for the
// previous serverRunner to exit. Callers must explicitly synchronize calls
// to reload by acquiring reloader#mu.
func (r *reloader) reload() error {
if r.rawConfig == nil {
// APM Server config not loaded yet.
return nil
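
For context, a minimal standalone sketch of the locking split introduced in this hunk (hypothetical `reloader` stand-in, not the apm-server code): exported entry points take the mutex, while the unexported reload assumes its callers already hold it.

```go
package main

import (
	"fmt"
	"sync"
)

// reloader is a hypothetical stand-in (not the apm-server type) illustrating
// the reloadOnce/reload split: exported entry points acquire the mutex and
// delegate to the unexported reload, which assumes mu is already held.
type reloader struct {
	mu     sync.Mutex
	config string
}

// Reload mutates shared state and restarts under one critical section, so
// another reload cannot interleave between the config write and the restart.
func (r *reloader) Reload(cfg string) error {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.config = cfg
	return r.reload()
}

// reloadOnce reloads with whatever configuration is already present.
func (r *reloader) reloadOnce() error {
	r.mu.Lock()
	defer r.mu.Unlock()
	return r.reload()
}

// reload must be called with r.mu held by the caller.
func (r *reloader) reload() error {
	fmt.Println("reloading with config:", r.config)
	return nil
}

func main() {
	r := &reloader{}
	_ = r.Reload("apm-server.yml")
	_ = r.reloadOnce()
}
```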
@@ -345,6 +354,17 @@ func (r *reloader) reload() error {
r.args.Logger.Error(err)
}
}()

// Wait for the new runner to start; this avoids the race condition in updating
// the monitoring#Default global registry inside the runner, which can occur when
// two reloads happen in quick succession (one for input and the other for output).
select {
case <-runner.done:
return errors.New("runner exited unexpectedly")
case <-runner.started:
// runner has started
Comment on lines +358 to +361
Contributor

Nice catch and great solution!

}

// If the old runner exists, cancel it
if r.runner != nil {
r.runner.cancelRunServerContext()
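
A minimal standalone sketch of the started/done channel pattern added in this hunk (hypothetical `runner` type and helpers, not the actual serverRunner): the reload path blocks until the new runner either signals startup or exits early.

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// runner is a hypothetical stand-in for serverRunner: it closes started once
// it is serving and closes done when it returns.
type runner struct {
	started chan struct{}
	done    chan struct{}
}

func newRunner() *runner {
	return &runner{
		started: make(chan struct{}),
		done:    make(chan struct{}),
	}
}

func (r *runner) run() {
	defer close(r.done)
	// ... set up pipelines, listeners, etc., then signal readiness.
	close(r.started)
	time.Sleep(100 * time.Millisecond) // stand-in for serving traffic
}

// reload launches a new runner and blocks until it has either started or
// exited, mirroring the select added above.
func reload() (*runner, error) {
	r := newRunner()
	go r.run()
	select {
	case <-r.done:
		return nil, errors.New("runner exited unexpectedly")
	case <-r.started:
		return r, nil
	}
}

func main() {
	if _, err := reload(); err != nil {
		fmt.Println("reload failed:", err)
		return
	}
	fmt.Println("runner started")
}
```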
@@ -364,6 +384,7 @@ type serverRunner struct {
// immediately when the Stop method is invoked.
runServerContext context.Context
cancelRunServerContext context.CancelFunc
started chan struct{}
done chan struct{}

pipeline beat.PipelineConnector
@@ -416,6 +437,7 @@ func newServerRunner(ctx context.Context, args serverRunnerParams) (*serverRunne
runServerContext: runServerContext,
cancelRunServerContext: cancel,
done: make(chan struct{}),
started: make(chan struct{}),

config: cfg,
rawConfig: args.RawConfig,
@@ -581,6 +603,10 @@ func (s *serverRunner) run(listener net.Listener) error {
NewElasticsearchClient: newElasticsearchClient,
})
})

// Signal that the runner has started
close(s.started)

result := g.Wait()
if err := closeFinalBatchProcessor(s.backgroundContext); err != nil {
result = multierror.Append(result, err)
@@ -664,19 +690,12 @@ func (s *serverRunner) waitReady(ctx context.Context, kibanaClient kibana.Client
return waitReady(ctx, s.config.WaitReadyInterval, s.tracer, s.logger, check)
}

// This mutex must be held when updating the libbeat monitoring registry,
// as there may be multiple servers running concurrently.
var monitoringRegistryMu sync.Mutex
@lahsivjar (Contributor Author) · Aug 18, 2022

[For reviewers] IIUC, this mutex is not required. Earlier, the mutex prevented the default monitoring registry from being accessed concurrently; however, that was not enough to prevent the race, since a temporary reload's run method might run later, leaving the default registry in an inconsistent state.

The current PR makes sure that run is called only once during reload, so this mutex is no longer required. Let me know if my understanding here is incorrect.
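
To illustrate the point above, here is a small standalone demo (hypothetical names, not libbeat's API): a mutex around each registry write prevents data races, yet it cannot guarantee that the newest reload's state is what remains in the registry.

```go
package main

import (
	"fmt"
	"sync"
)

// registry is a hypothetical stand-in for libbeat's global monitoring registry.
var (
	registryMu sync.Mutex // plays the role of the removed monitoringRegistryMu
	registry   string
)

// install locks around the write, so each individual update is race-free.
// The lock cannot, however, control which reload's write lands last: a
// superseded runner scheduled later still overwrites the registry.
func install(state string, wg *sync.WaitGroup) {
	defer wg.Done()
	registryMu.Lock()
	defer registryMu.Unlock()
	registry = state
}

func main() {
	var wg sync.WaitGroup
	wg.Add(2)
	// Two reloads arriving close together (input and output); the final
	// registry contents depend on goroutine scheduling, not recency.
	go install("input-reload", &wg)
	go install("output-reload", &wg)
	wg.Wait()
	fmt.Println("registry:", registry) // either value may win
}
```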


// newFinalBatchProcessor returns the final model.BatchProcessor that publishes events,
// and a cleanup function which should be called on server shutdown. If the output is
// "elasticsearch", then we use modelindexer; otherwise we use the libbeat publisher.
func (s *serverRunner) newFinalBatchProcessor(
newElasticsearchClient func(cfg *elasticsearch.Config) (elasticsearch.Client, error),
) (model.BatchProcessor, func(context.Context) error, error) {
monitoringRegistryMu.Lock()
defer monitoringRegistryMu.Unlock()

if s.elasticsearchOutputConfig == nil {
// When the publisher stops cleanly it will close its pipeline client,
// calling the acker's Close method. We need to call Open for each new