Skip to content

Commit

Permalink
New health status check and report for the status api, it detects if …
Browse files Browse the repository at this point in the history
…the CommandApi is not returning a 200. For example if License Key is incorrect.
  • Loading branch information
alvarocabanas committed Aug 13, 2024
1 parent 723f535 commit ad15294
Show file tree
Hide file tree
Showing 7 changed files with 366 additions and 48 deletions.
2 changes: 1 addition & 1 deletion cmd/newrelic-infra/newrelic-infra.go
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ func initializeAgentAndRun(c *config.Config, logFwCfg config.LogForward) error {
// This should never happen, as the correct format is checked during NormalizeConfig.
aslog.WithError(err).Error("invalid startup_connection_timeout value, cannot run status server")
} else {
rep := status.NewReporter(agt.Context.Ctx, rlog, c.StatusEndpoints, timeoutD, transport, agt.Context.AgentIdnOrEmpty, agt.Context.EntityKey, c.License, userAgent)
rep := status.NewReporter(agt.Context.Ctx, rlog, c.StatusEndpoints, c.HealthEndpoint, timeoutD, transport, agt.Context.AgentIdnOrEmpty, agt.Context.EntityKey, c.License, userAgent)

apiSrv, err := httpapi.NewServer(rep, integrationEmitter)
if c.HTTPServerEnabled {
Expand Down
53 changes: 52 additions & 1 deletion internal/agent/status/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ const (
// Report agent status report. It contains:
// - checks:
// - backend endpoints reachability statuses
// - backend communication healthiness
//
// - configuration
// fields will be empty when ReportErrors() reports no errors.
Expand All @@ -31,6 +32,7 @@ type Report struct {

// ChecksReport groups the results of the checks included in a status report.
type ChecksReport struct {
	// Endpoints holds the backend endpoint reachability reports; the key is
	// omitted from the JSON output when the slice is empty.
	Endpoints []EndpointReport `json:"endpoints,omitempty"`
	// Health holds the backend communication health report. It is always
	// serialized: encoding/json never considers a struct value "empty", so an
	// omitempty option on this field would be a no-op and is left out to keep
	// the tag honest about the wire format.
	Health HealthReport `json:"health"`
}

// ConfigReport configuration used for status report.
Expand All @@ -45,6 +47,12 @@ type EndpointReport struct {
Error string `json:"error,omitempty"`
}

// HealthReport represents the backend communication healthiness status.
type HealthReport struct {
// Healthy is the result of the health check; it is always present in the
// JSON output (no omitempty), so a false value is serialized explicitly.
Healthy bool `json:"healthy"`
// Error is the message of the error returned by the health check, if any;
// it is omitted from the JSON output when empty.
Error string `json:"error,omitempty"`
}

// ReportEntity agent entity report.
type ReportEntity struct {
GUID string `json:"guid"`
Expand All @@ -59,12 +67,15 @@ type Reporter interface {
ReportErrors() (Report, error)
// ReportEntity agent entity report.
ReportEntity() (ReportEntity, error)
// ReportHealth agent health report.
ReportHealth() HealthReport
}

type nrReporter struct {
ctx context.Context
log log.Entry
endpoints []string // NR backend URLs
healthEndpoint string // NR command backend URL to check communication healthiness
license string
userAgent string
idProvide id.Provide
Expand Down Expand Up @@ -119,8 +130,19 @@ func (r *nrReporter) report(onlyErrors bool) (report Report, err error) {
}(ep)
}

hReportC := make(chan HealthReport, 1)

wg.Add(1)

go func() {
hReportC <- r.getHealth(agentID)

wg.Done()
}()

wg.Wait()
close(eReportsC)
close(hReportC)

var errored bool
var eReports []EndpointReport
Expand All @@ -132,16 +154,17 @@ func (r *nrReporter) report(onlyErrors bool) (report Report, err error) {
errored = true
}
}
hreport := <-hReportC

if !onlyErrors || errored {
if report.Checks == nil {
report.Checks = &ChecksReport{}
}
report.Checks.Endpoints = eReports
report.Checks.Health = hreport
report.Config = &ConfigReport{
ReachabilityTimeout: r.timeout.String(),
}

}

return
Expand All @@ -154,11 +177,38 @@ func (r *nrReporter) ReportEntity() (re ReportEntity, err error) {
}, nil
}

// ReportHealth resolves the current agent ID through the configured ID
// provider and returns the result of the health check for it.
func (r *nrReporter) ReportHealth() HealthReport {
	return r.getHealth(r.idProvide().ID.String())
}

// getHealth checks the configured health endpoint on behalf of agentID and
// converts the outcome into a HealthReport. The Healthy flag is taken verbatim
// from the check; Error is filled in only when the check returned an error.
func (r *nrReporter) getHealth(agentID string) HealthReport {
	var (
		hr  HealthReport
		err error
	)

	hr.Healthy, err = backendhttp.CheckEndpointHealthiness(
		r.ctx,
		r.healthEndpoint,
		r.license,
		r.userAgent,
		agentID,
		r.timeout,
		r.transport,
	)
	if err == nil {
		return hr
	}

	hr.Error = err.Error()

	return hr
}

// NewReporter creates a new status reporter.
func NewReporter(
ctx context.Context,
l log.Entry,
backendEndpoints []string,
healthEndpoint string,
timeout time.Duration,
transport http.RoundTripper,
agentIDProvide id.Provide,
Expand All @@ -171,6 +221,7 @@ func NewReporter(
ctx: ctx,
log: l,
endpoints: backendEndpoints,
healthEndpoint: healthEndpoint,
license: license,
userAgent: userAgent,
idProvide: agentIDProvide,
Expand Down
163 changes: 135 additions & 28 deletions internal/agent/status/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package status

import (
"context"
http2 "github.com/newrelic/infrastructure-agent/pkg/backend/http"
"net/http"
"net/http/httptest"
"testing"
Expand All @@ -20,11 +21,17 @@ func TestNewReporter_Report(t *testing.T) {
w.WriteHeader(200)
}))
defer serverOk.Close()

serverTimeout := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(10 * time.Second)
}))
defer serverTimeout.Close()

serverUnauthorized := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(401)
}))
defer serverUnauthorized.Close()

assert.Eventually(t,
func() bool {
res, err := serverOk.Client().Get(serverOk.URL)
Expand All @@ -33,29 +40,54 @@ func TestNewReporter_Report(t *testing.T) {
time.Second, 10*time.Millisecond)

endpointsOk := []string{serverOk.URL}
healthEndpointOK := serverOk.URL
endpointsTimeout := []string{serverTimeout.URL}
healthEndpointTimeout := serverTimeout.URL
endpointsMixed := []string{serverOk.URL, serverTimeout.URL}
healthEndpointUnauthorized := serverUnauthorized.URL

expectReportOk := Report{Checks: &ChecksReport{Endpoints: []EndpointReport{{
URL: serverOk.URL,
Reachable: true,
}}}}
expectReportTimeout := Report{Checks: &ChecksReport{Endpoints: []EndpointReport{{
URL: serverTimeout.URL,
Reachable: false,
Error: endpointTimeoutMsg, // substring is enough, it'll assert via "string contains"
}}}}
expectReportMixed := Report{Checks: &ChecksReport{Endpoints: []EndpointReport{
{
URL: serverOk.URL,
Reachable: true,
expectReportOk := Report{Checks: &ChecksReport{
Endpoints: []EndpointReport{
{
URL: serverOk.URL,
Reachable: true,
},
},
{
URL: serverTimeout.URL,
Reachable: false,
Error: endpointTimeoutMsg,
Health: HealthReport{
Healthy: true,
},
}}}
}}

expectReportTimeout := Report{Checks: &ChecksReport{
Endpoints: []EndpointReport{
{
URL: serverTimeout.URL,
Error: endpointTimeoutMsg, // substring is enough, it'll assert via "string contains"
},
},
Health: HealthReport{
Healthy: false,
Error: "context deadline exceeded",
},
}}

expectReportMixed := Report{Checks: &ChecksReport{
Endpoints: []EndpointReport{
{
URL: serverOk.URL,
Reachable: true,
},
{
URL: serverTimeout.URL,
Reachable: false,
Error: endpointTimeoutMsg,
},
},
Health: HealthReport{
Healthy: false,
Error: http2.ErrUnexepectedResponseCode.Error(),
},
}}

timeout := 10 * time.Millisecond
transport := &http.Transport{}
Expand All @@ -66,19 +98,20 @@ func TestNewReporter_Report(t *testing.T) {
return ""
}
tests := []struct {
name string
endpoints []string
want Report
wantErr bool
name string
endpoints []string
healthEndpoint string
want Report
wantErr bool
}{
{"connectivity ok", endpointsOk, expectReportOk, false},
{"connectivity timedout", endpointsTimeout, expectReportTimeout, false},
{"connectivities ok and timeout", endpointsMixed, expectReportMixed, false},
{"connectivity ok", endpointsOk, healthEndpointOK, expectReportOk, false},
{"connectivity timedout", endpointsTimeout, healthEndpointTimeout, expectReportTimeout, false},
{"connectivities ok and timeout and unhealthy", endpointsMixed, healthEndpointUnauthorized, expectReportMixed, false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
l := log.WithComponent(tt.name)
r := NewReporter(context.Background(), l, tt.endpoints, timeout, transport, emptyIDProvide, emptyEntityKeyProvider, "user-agent", "agent-key")
r := NewReporter(context.Background(), l, tt.endpoints, tt.healthEndpoint, timeout, transport, emptyIDProvide, emptyEntityKeyProvider, "user-agent", "agent-key")

got, err := r.Report()

Expand All @@ -103,6 +136,8 @@ func TestNewReporter_Report(t *testing.T) {
assert.Equal(t, expectedEndpoint.Reachable, gotEndpoint.Reachable)
assert.Contains(t, gotEndpoint.Error, expectedEndpoint.Error)
}
assert.Equal(t, tt.want.Checks.Health.Healthy, got.Checks.Health.Healthy)
assert.Contains(t, got.Checks.Health.Error, tt.want.Checks.Health.Error)
})
}
}
Expand All @@ -118,6 +153,7 @@ func TestNewReporter_ReportErrors(t *testing.T) {
defer serverTimeout.Close()

endpointsOk := []string{serverOk.URL}
healthEndpointOK := serverOk.URL
endpointsTimeout := []string{serverTimeout.URL}
endpointsMixed := []string{serverOk.URL, serverTimeout.URL}

Expand Down Expand Up @@ -156,7 +192,7 @@ func TestNewReporter_ReportErrors(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
l := log.WithComponent(tt.name)
r := NewReporter(context.Background(), l, tt.endpoints, timeout, transport, emptyIDProvide, emptyEntityKeyProvider, "user-agent", "agent-key")
r := NewReporter(context.Background(), l, tt.endpoints, healthEndpointOK, timeout, transport, emptyIDProvide, emptyEntityKeyProvider, "user-agent", "agent-key")

got, err := r.ReportErrors()

Expand Down Expand Up @@ -216,7 +252,7 @@ func TestNewReporter_ReportEntity(t *testing.T) {
entityKeyProvider := func() string {
return tt.entityKey
}
r := NewReporter(context.Background(), l, []string{}, timeout, transport, idProvide, entityKeyProvider, "user-agent", "agent-key")
r := NewReporter(context.Background(), l, []string{}, "", timeout, transport, idProvide, entityKeyProvider, "user-agent", "agent-key")

got, err := r.ReportEntity()

Expand All @@ -230,3 +266,74 @@ func TestNewReporter_ReportEntity(t *testing.T) {
})
}
}

func TestNewReporter_ReportHealth(t *testing.T) {
serverOk := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
}))
defer serverOk.Close()

serverTimeout := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(10 * time.Second)
}))
defer serverTimeout.Close()

serverUnauthorized := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(401)
}))
defer serverUnauthorized.Close()

assert.Eventually(t,
func() bool {
res, err := serverOk.Client().Get(serverOk.URL)

Check failure on line 288 in internal/agent/status/status_test.go

View workflow job for this annotation

GitHub Actions / linter-linux / Run Linter

response body must be closed (bodyclose)

Check failure on line 288 in internal/agent/status/status_test.go

View workflow job for this annotation

GitHub Actions / linter-macos / Lint tests

response body must be closed (bodyclose)

Check failure on line 288 in internal/agent/status/status_test.go

View workflow job for this annotation

GitHub Actions / linter-windows / Lint tests

response body must be closed (bodyclose)
return err == nil && res.StatusCode == 200
},
time.Second, 10*time.Millisecond)

healthEndpointOK := serverOk.URL
healthEndpointTimeout := serverTimeout.URL
healthEndpointUnauthorized := serverUnauthorized.URL

expectReportOk := HealthReport{
Healthy: true,
}

expectReportTimeout := HealthReport{
Healthy: false,
Error: "context deadline exceeded",
}

expectReportUnauthorized := HealthReport{
Healthy: false,
Error: http2.ErrUnexepectedResponseCode.Error(),
}

timeout := 10 * time.Millisecond
transport := &http.Transport{}
emptyIDProvide := func() entity.Identity {
return entity.EmptyIdentity
}
emptyEntityKeyProvider := func() string {
return ""
}
tests := []struct {
name string
healthEndpoint string
want HealthReport
}{
{"connectivity ok", healthEndpointOK, expectReportOk},
{"connectivity timedout", healthEndpointTimeout, expectReportTimeout},
{"unhealthy", healthEndpointUnauthorized, expectReportUnauthorized},
}
for _, tt := range tests {

Check failure on line 328 in internal/agent/status/status_test.go

View workflow job for this annotation

GitHub Actions / linter-linux / Run Linter

only one cuddle assignment allowed before range statement (wsl)

Check failure on line 328 in internal/agent/status/status_test.go

View workflow job for this annotation

GitHub Actions / linter-macos / Lint tests

only one cuddle assignment allowed before range statement (wsl)

Check failure on line 328 in internal/agent/status/status_test.go

View workflow job for this annotation

GitHub Actions / linter-windows / Lint tests

only one cuddle assignment allowed before range statement (wsl)
t.Run(tt.name, func(t *testing.T) {
l := log.WithComponent(tt.name)
r := NewReporter(context.Background(), l, nil, tt.healthEndpoint, timeout, transport, emptyIDProvide, emptyEntityKeyProvider, "user-agent", "agent-key")

got := r.ReportHealth()

assert.Equal(t, tt.want.Healthy, got.Healthy)
assert.Contains(t, got.Error, tt.want.Error)
})
}
}
Loading

0 comments on commit ad15294

Please sign in to comment.